fs/ext4/extents.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4  * Written by Alex Tomas <alex@clusterfs.com>
5  *
6  * Architecture independence:
7  *   Copyright (c) 2005, Bull S.A.
8  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
9  */
10
11 /*
12  * Extents support for EXT4
13  *
14  * TODO:
15  *   - ext4*_error() should be used in some situations
16  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
17  *   - smart tree reduction
18  */
19
20 #include <linux/fs.h>
21 #include <linux/time.h>
22 #include <linux/jbd2.h>
23 #include <linux/highuid.h>
24 #include <linux/pagemap.h>
25 #include <linux/quotaops.h>
26 #include <linux/string.h>
27 #include <linux/slab.h>
28 #include <linux/uaccess.h>
29 #include <linux/fiemap.h>
30 #include <linux/backing-dev.h>
31 #include "ext4_jbd2.h"
32 #include "ext4_extents.h"
33 #include "xattr.h"
34
35 #include <trace/events/ext4.h>
36
37 /*
38  * used by extent splitting.
39  */
40 #define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails
41                                         due to ENOSPC */
42 #define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
43 #define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
44
45 #define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
46 #define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
47
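/*
 * Extent tree block checksumming: when metadata checksums are enabled,
 * each extent tree block carries an ext4_extent_tail whose et_checksum
 * covers the block contents up to EXT4_EXTENT_TAIL_OFFSET(), seeded with
 * the per-inode checksum seed.  The helpers below compute, verify and
 * store that checksum.
 */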
48 static __le32 ext4_extent_block_csum(struct inode *inode,
49                                      struct ext4_extent_header *eh)
50 {
51         struct ext4_inode_info *ei = EXT4_I(inode);
52         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
53         __u32 csum;
54
55         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
56                            EXT4_EXTENT_TAIL_OFFSET(eh));
57         return cpu_to_le32(csum);
58 }
59
60 static int ext4_extent_block_csum_verify(struct inode *inode,
61                                          struct ext4_extent_header *eh)
62 {
63         struct ext4_extent_tail *et;
64
65         if (!ext4_has_metadata_csum(inode->i_sb))
66                 return 1;
67
68         et = find_ext4_extent_tail(eh);
69         if (et->et_checksum != ext4_extent_block_csum(inode, eh))
70                 return 0;
71         return 1;
72 }
73
74 static void ext4_extent_block_csum_set(struct inode *inode,
75                                        struct ext4_extent_header *eh)
76 {
77         struct ext4_extent_tail *et;
78
79         if (!ext4_has_metadata_csum(inode->i_sb))
80                 return;
81
82         et = find_ext4_extent_tail(eh);
83         et->et_checksum = ext4_extent_block_csum(inode, eh);
84 }
85
86 static int ext4_split_extent(handle_t *handle,
87                                 struct inode *inode,
88                                 struct ext4_ext_path **ppath,
89                                 struct ext4_map_blocks *map,
90                                 int split_flag,
91                                 int flags);
92
93 static int ext4_split_extent_at(handle_t *handle,
94                              struct inode *inode,
95                              struct ext4_ext_path **ppath,
96                              ext4_lblk_t split,
97                              int split_flag,
98                              int flags);
99
100 static int ext4_find_delayed_extent(struct inode *inode,
101                                     struct extent_status *newes);
102
103 static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
104 {
105         /*
106          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
107          * moment, get_block can be called only for blocks inside i_size since
108          * the page cache has already been dropped and writes are blocked by
109          * i_mutex. So we can safely drop the i_data_sem here.
110          */
111         BUG_ON(EXT4_JOURNAL(inode) == NULL);
112         ext4_discard_preallocations(inode);
113         up_write(&EXT4_I(inode)->i_data_sem);
114         *dropped = 1;
115         return 0;
116 }
117
118 /*
119  * Make sure 'handle' has at least 'check_cred' credits. If not, restart the
120  * transaction with 'restart_cred' credits. The function drops i_data_sem
121  * when restarting the transaction and re-acquires it once it has been restarted.
122  *
123  * The function returns 0 on success, 1 if transaction had to be restarted,
124  * and < 0 in case of fatal error.
125  */
126 int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
127                                 int check_cred, int restart_cred,
128                                 int revoke_cred)
129 {
130         int ret;
131         int dropped = 0;
132
133         ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
134                 revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
135         if (dropped)
136                 down_write(&EXT4_I(inode)->i_data_sem);
137         return ret;
138 }
139
140 /*
141  * could return:
142  *  - EROFS
143  *  - ENOMEM
144  */
145 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
146                                 struct ext4_ext_path *path)
147 {
148         if (path->p_bh) {
149                 /* path points to block */
150                 BUFFER_TRACE(path->p_bh, "get_write_access");
151                 return ext4_journal_get_write_access(handle, path->p_bh);
152         }
153         /* path points to leaf/index in inode body */
154         /* we use in-core data, no need to protect them */
155         return 0;
156 }
157
158 /*
159  * could return:
160  *  - EROFS
161  *  - ENOMEM
162  *  - EIO
163  */
164 static int __ext4_ext_dirty(const char *where, unsigned int line,
165                             handle_t *handle, struct inode *inode,
166                             struct ext4_ext_path *path)
167 {
168         int err;
169
170         WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
171         if (path->p_bh) {
172                 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
173                 /* path points to block */
174                 err = __ext4_handle_dirty_metadata(where, line, handle,
175                                                    inode, path->p_bh);
176         } else {
177                 /* path points to leaf/index in inode body */
178                 err = ext4_mark_inode_dirty(handle, inode);
179         }
180         return err;
181 }
182
183 #define ext4_ext_dirty(handle, inode, path) \
184                 __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
185
186 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
187                               struct ext4_ext_path *path,
188                               ext4_lblk_t block)
189 {
190         if (path) {
191                 int depth = path->p_depth;
192                 struct ext4_extent *ex;
193
194                 /*
195                  * Try to predict block placement assuming that we are
196                  * filling in a file which will eventually be
197                  * non-sparse --- i.e., in the case of libbfd writing
198                  * ELF object sections out-of-order but in a way that
199                  * eventually results in a contiguous object or
200                  * executable file, or some database extending a table
201                  * space file.  However, this is actually somewhat
202                  * non-ideal if we are writing a sparse file such as
203                  * qemu or KVM writing a raw image file that is going
204                  * to stay fairly sparse, since it will end up
205                  * fragmenting the file system's free space.  Maybe we
206                  * should have some heuristics or some way to allow
207                  * userspace to pass a hint to the file system,
208                  * especially if the latter case turns out to be
209                  * common.
210                  */
211                 ex = path[depth].p_ext;
212                 if (ex) {
213                         ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
214                         ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
215
216                         if (block > ext_block)
217                                 return ext_pblk + (block - ext_block);
218                         else
219                                 return ext_pblk - (ext_block - block);
220                 }
221
222                 /* it looks like the index is empty;
223                  * try to find the starting block from the index itself */
224                 if (path[depth].p_bh)
225                         return path[depth].p_bh->b_blocknr;
226         }
227
228         /* OK. use inode's group */
229         return ext4_inode_to_goal_block(inode);
230 }
231
232 /*
233  * Allocation for a metadata block
234  */
235 static ext4_fsblk_t
236 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
237                         struct ext4_ext_path *path,
238                         struct ext4_extent *ex, int *err, unsigned int flags)
239 {
240         ext4_fsblk_t goal, newblock;
241
242         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
243         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
244                                         NULL, err);
245         return newblock;
246 }
247
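/*
 * Per-node capacity helpers: how many extents or indexes fit in a single
 * tree node.  For an on-disk block this is (blocksize - header) / entry
 * size; for the root, which lives in the inode's i_data, it is
 * (sizeof(i_data) - header) / entry size.  As a rough worked example,
 * assuming the usual 12-byte header and 12-byte entries, a 4 KiB block
 * holds (4096 - 12) / 12 = 340 entries (the 4 leftover bytes hold the
 * checksum tail), while the 60-byte i_data root holds
 * (60 - 12) / 12 = 4 entries.  AGGRESSIVE_TEST shrinks these limits
 * artificially so that tree growth is exercised in testing.
 */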
248 static inline int ext4_ext_space_block(struct inode *inode, int check)
249 {
250         int size;
251
252         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
253                         / sizeof(struct ext4_extent);
254 #ifdef AGGRESSIVE_TEST
255         if (!check && size > 6)
256                 size = 6;
257 #endif
258         return size;
259 }
260
261 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
262 {
263         int size;
264
265         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
266                         / sizeof(struct ext4_extent_idx);
267 #ifdef AGGRESSIVE_TEST
268         if (!check && size > 5)
269                 size = 5;
270 #endif
271         return size;
272 }
273
274 static inline int ext4_ext_space_root(struct inode *inode, int check)
275 {
276         int size;
277
278         size = sizeof(EXT4_I(inode)->i_data);
279         size -= sizeof(struct ext4_extent_header);
280         size /= sizeof(struct ext4_extent);
281 #ifdef AGGRESSIVE_TEST
282         if (!check && size > 3)
283                 size = 3;
284 #endif
285         return size;
286 }
287
288 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
289 {
290         int size;
291
292         size = sizeof(EXT4_I(inode)->i_data);
293         size -= sizeof(struct ext4_extent_header);
294         size /= sizeof(struct ext4_extent_idx);
295 #ifdef AGGRESSIVE_TEST
296         if (!check && size > 4)
297                 size = 4;
298 #endif
299         return size;
300 }
301
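/*
 * Force a split of the extent containing @lblk at @lblk, preserving the
 * unwritten state of both halves.  @nofail adds
 * EXT4_GET_BLOCKS_METADATA_NOFAIL so that the metadata allocation needed
 * for the split does not fail due to ENOSPC.
 */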
302 static inline int
303 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
304                            struct ext4_ext_path **ppath, ext4_lblk_t lblk,
305                            int nofail)
306 {
307         struct ext4_ext_path *path = *ppath;
308         int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
309
310         return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
311                         EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
312                         EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
313                         (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
314 }
315
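/*
 * Maximum number of entries allowed in a tree node at @depth: the root
 * node lives in the inode body and therefore has a smaller capacity than
 * nodes stored in full disk blocks.
 */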
316 static int
317 ext4_ext_max_entries(struct inode *inode, int depth)
318 {
319         int max;
320
321         if (depth == ext_depth(inode)) {
322                 if (depth == 0)
323                         max = ext4_ext_space_root(inode, 1);
324                 else
325                         max = ext4_ext_space_root_idx(inode, 1);
326         } else {
327                 if (depth == 0)
328                         max = ext4_ext_space_block(inode, 1);
329                 else
330                         max = ext4_ext_space_block_idx(inode, 1);
331         }
332
333         return max;
334 }
335
336 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
337 {
338         ext4_fsblk_t block = ext4_ext_pblock(ext);
339         int len = ext4_ext_get_actual_len(ext);
340         ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
341
342         /*
343          * We allow neither:
344          *  - zero length
345          *  - overflow/wrap-around
346          */
347         if (lblock + len <= lblock)
348                 return 0;
349         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
350 }
351
352 static int ext4_valid_extent_idx(struct inode *inode,
353                                 struct ext4_extent_idx *ext_idx)
354 {
355         ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
356
357         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
358 }
359
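/*
 * Validate all entries of a tree node: leaf extents must be non-empty,
 * non-overlapping, in order, and map valid block ranges; index entries
 * must point to valid blocks.
 */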
360 static int ext4_valid_extent_entries(struct inode *inode,
361                                 struct ext4_extent_header *eh,
362                                 int depth)
363 {
364         unsigned short entries;
365         if (eh->eh_entries == 0)
366                 return 1;
367
368         entries = le16_to_cpu(eh->eh_entries);
369
370         if (depth == 0) {
371                 /* leaf entries */
372                 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
373                 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
374                 ext4_fsblk_t pblock = 0;
375                 ext4_lblk_t lblock = 0;
376                 ext4_lblk_t prev = 0;
377                 int len = 0;
378                 while (entries) {
379                         if (!ext4_valid_extent(inode, ext))
380                                 return 0;
381
382                         /* Check for overlapping extents */
383                         lblock = le32_to_cpu(ext->ee_block);
384                         len = ext4_ext_get_actual_len(ext);
385                         if ((lblock <= prev) && prev) {
386                                 pblock = ext4_ext_pblock(ext);
387                                 es->s_last_error_block = cpu_to_le64(pblock);
388                                 return 0;
389                         }
390                         ext++;
391                         entries--;
392                         prev = lblock + len - 1;
393                 }
394         } else {
395                 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
396                 while (entries) {
397                         if (!ext4_valid_extent_idx(inode, ext_idx))
398                                 return 0;
399                         ext_idx++;
400                         entries--;
401                 }
402         }
403         return 1;
404 }
405
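/*
 * Sanity-check a tree node header (magic, depth, eh_max, eh_entries) and
 * its entries, and verify the block checksum for non-root nodes.  On
 * corruption the problem is reported against the inode and -EFSCORRUPTED
 * (or -EFSBADCRC for a checksum failure) is returned.
 */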
406 static int __ext4_ext_check(const char *function, unsigned int line,
407                             struct inode *inode, struct ext4_extent_header *eh,
408                             int depth, ext4_fsblk_t pblk)
409 {
410         const char *error_msg;
411         int max = 0, err = -EFSCORRUPTED;
412
413         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
414                 error_msg = "invalid magic";
415                 goto corrupted;
416         }
417         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
418                 error_msg = "unexpected eh_depth";
419                 goto corrupted;
420         }
421         if (unlikely(eh->eh_max == 0)) {
422                 error_msg = "invalid eh_max";
423                 goto corrupted;
424         }
425         max = ext4_ext_max_entries(inode, depth);
426         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
427                 error_msg = "too large eh_max";
428                 goto corrupted;
429         }
430         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
431                 error_msg = "invalid eh_entries";
432                 goto corrupted;
433         }
434         if (!ext4_valid_extent_entries(inode, eh, depth)) {
435                 error_msg = "invalid extent entries";
436                 goto corrupted;
437         }
438         if (unlikely(depth > 32)) {
439                 error_msg = "too large eh_depth";
440                 goto corrupted;
441         }
442         /* Verify checksum on non-root extent tree nodes */
443         if (ext_depth(inode) != depth &&
444             !ext4_extent_block_csum_verify(inode, eh)) {
445                 error_msg = "extent tree corrupted";
446                 err = -EFSBADCRC;
447                 goto corrupted;
448         }
449         return 0;
450
451 corrupted:
452         ext4_set_errno(inode->i_sb, -err);
453         ext4_error_inode(inode, function, line, 0,
454                          "pblk %llu bad header/extent: %s - magic %x, "
455                          "entries %u, max %u(%u), depth %u(%u)",
456                          (unsigned long long) pblk, error_msg,
457                          le16_to_cpu(eh->eh_magic),
458                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
459                          max, le16_to_cpu(eh->eh_depth), depth);
460         return err;
461 }
462
463 #define ext4_ext_check(inode, eh, depth, pblk)                  \
464         __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
465
466 int ext4_ext_check_inode(struct inode *inode)
467 {
468         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
469 }
470
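/*
 * Pre-populate the extent status tree from a leaf node: cache each extent
 * as written or unwritten, and cache the gaps between extents as holes.
 */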
471 static void ext4_cache_extents(struct inode *inode,
472                                struct ext4_extent_header *eh)
473 {
474         struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
475         ext4_lblk_t prev = 0;
476         int i;
477
478         for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
479                 unsigned int status = EXTENT_STATUS_WRITTEN;
480                 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
481                 int len = ext4_ext_get_actual_len(ex);
482
483                 if (prev && (prev != lblk))
484                         ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
485                                              EXTENT_STATUS_HOLE);
486
487                 if (ext4_ext_is_unwritten(ex))
488                         status = EXTENT_STATUS_UNWRITTEN;
489                 ext4_es_cache_extent(inode, lblk, len,
490                                      ext4_ext_pblock(ex), status);
491                 prev = lblk + len;
492         }
493 }
494
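/*
 * Read the extent tree block at @pblk and verify its header and checksum
 * the first time it is seen (the journal inode is exempt from the check).
 * Unless EXT4_EX_NOCACHE is set, leaf blocks are also cached in the
 * extent status tree.
 */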
495 static struct buffer_head *
496 __read_extent_tree_block(const char *function, unsigned int line,
497                          struct inode *inode, ext4_fsblk_t pblk, int depth,
498                          int flags)
499 {
500         struct buffer_head              *bh;
501         int                             err;
502
503         bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
504         if (unlikely(!bh))
505                 return ERR_PTR(-ENOMEM);
506
507         if (!bh_uptodate_or_lock(bh)) {
508                 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
509                 err = bh_submit_read(bh);
510                 if (err < 0)
511                         goto errout;
512         }
513         if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
514                 return bh;
515         if (!ext4_has_feature_journal(inode->i_sb) ||
516             (inode->i_ino !=
517              le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
518                 err = __ext4_ext_check(function, line, inode,
519                                        ext_block_hdr(bh), depth, pblk);
520                 if (err)
521                         goto errout;
522         }
523         set_buffer_verified(bh);
524         /*
525          * If this is a leaf block, cache all of its entries
526          */
527         if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
528                 struct ext4_extent_header *eh = ext_block_hdr(bh);
529                 ext4_cache_extents(inode, eh);
530         }
531         return bh;
532 errout:
533         put_bh(bh);
534         return ERR_PTR(err);
535
536 }
537
538 #define read_extent_tree_block(inode, pblk, depth, flags)               \
539         __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
540                                  (depth), (flags))
541
542 /*
543  * This function is called to cache a file's extent information in the
544  * extent status tree
545  */
546 int ext4_ext_precache(struct inode *inode)
547 {
548         struct ext4_inode_info *ei = EXT4_I(inode);
549         struct ext4_ext_path *path = NULL;
550         struct buffer_head *bh;
551         int i = 0, depth, ret = 0;
552
553         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
554                 return 0;       /* not an extent-mapped inode */
555
556         down_read(&ei->i_data_sem);
557         depth = ext_depth(inode);
558
559         path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
560                        GFP_NOFS);
561         if (path == NULL) {
562                 up_read(&ei->i_data_sem);
563                 return -ENOMEM;
564         }
565
566         /* Don't cache anything if there are no external extent blocks */
567         if (depth == 0)
568                 goto out;
569         path[0].p_hdr = ext_inode_hdr(inode);
570         ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
571         if (ret)
572                 goto out;
573         path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
574         while (i >= 0) {
575                 /*
576                  * If this is a leaf block or we've reached the end of
577                  * the index block, go up
578                  */
579                 if ((i == depth) ||
580                     path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
581                         brelse(path[i].p_bh);
582                         path[i].p_bh = NULL;
583                         i--;
584                         continue;
585                 }
586                 bh = read_extent_tree_block(inode,
587                                             ext4_idx_pblock(path[i].p_idx++),
588                                             depth - i - 1,
589                                             EXT4_EX_FORCE_CACHE);
590                 if (IS_ERR(bh)) {
591                         ret = PTR_ERR(bh);
592                         break;
593                 }
594                 i++;
595                 path[i].p_bh = bh;
596                 path[i].p_hdr = ext_block_hdr(bh);
597                 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
598         }
599         ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
600 out:
601         up_read(&ei->i_data_sem);
602         ext4_ext_drop_refs(path);
603         kfree(path);
604         return ret;
605 }
606
607 #ifdef EXT_DEBUG
608 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
609 {
610         int k, l = path->p_depth;
611
612         ext_debug("path:");
613         for (k = 0; k <= l; k++, path++) {
614                 if (path->p_idx) {
615                         ext_debug("  %d->%llu",
616                                   le32_to_cpu(path->p_idx->ei_block),
617                                   ext4_idx_pblock(path->p_idx));
618                 } else if (path->p_ext) {
619                         ext_debug("  %d:[%d]%d:%llu ",
620                                   le32_to_cpu(path->p_ext->ee_block),
621                                   ext4_ext_is_unwritten(path->p_ext),
622                                   ext4_ext_get_actual_len(path->p_ext),
623                                   ext4_ext_pblock(path->p_ext));
624                 } else
625                         ext_debug("  []");
626         }
627         ext_debug("\n");
628 }
629
630 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
631 {
632         int depth = ext_depth(inode);
633         struct ext4_extent_header *eh;
634         struct ext4_extent *ex;
635         int i;
636
637         if (!path)
638                 return;
639
640         eh = path[depth].p_hdr;
641         ex = EXT_FIRST_EXTENT(eh);
642
643         ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
644
645         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
646                 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
647                           ext4_ext_is_unwritten(ex),
648                           ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
649         }
650         ext_debug("\n");
651 }
652
653 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
654                         ext4_fsblk_t newblock, int level)
655 {
656         int depth = ext_depth(inode);
657         struct ext4_extent *ex;
658
659         if (depth != level) {
660                 struct ext4_extent_idx *idx;
661                 idx = path[level].p_idx;
662                 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
663                         ext_debug("%d: move %d:%llu in new index %llu\n", level,
664                                         le32_to_cpu(idx->ei_block),
665                                         ext4_idx_pblock(idx),
666                                         newblock);
667                         idx++;
668                 }
669
670                 return;
671         }
672
673         ex = path[depth].p_ext;
674         while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
675                 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
676                                 le32_to_cpu(ex->ee_block),
677                                 ext4_ext_pblock(ex),
678                                 ext4_ext_is_unwritten(ex),
679                                 ext4_ext_get_actual_len(ex),
680                                 newblock);
681                 ex++;
682         }
683 }
684
685 #else
686 #define ext4_ext_show_path(inode, path)
687 #define ext4_ext_show_leaf(inode, path)
688 #define ext4_ext_show_move(inode, path, newblock, level)
689 #endif
690
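/*
 * Release the buffer_head reference held at each level of @path.
 * The path array itself must still be freed by the caller.
 */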
691 void ext4_ext_drop_refs(struct ext4_ext_path *path)
692 {
693         int depth, i;
694
695         if (!path)
696                 return;
697         depth = path->p_depth;
698         for (i = 0; i <= depth; i++, path++) {
699                 if (path->p_bh) {
700                         brelse(path->p_bh);
701                         path->p_bh = NULL;
702                 }
703         }
704 }
705
706 /*
707  * ext4_ext_binsearch_idx:
708  * binary search for the closest index to the given block.
709  * The header must be checked before calling this.
710  */
711 static void
712 ext4_ext_binsearch_idx(struct inode *inode,
713                         struct ext4_ext_path *path, ext4_lblk_t block)
714 {
715         struct ext4_extent_header *eh = path->p_hdr;
716         struct ext4_extent_idx *r, *l, *m;
717
718
719         ext_debug("binsearch for %u(idx):  ", block);
720
721         l = EXT_FIRST_INDEX(eh) + 1;
722         r = EXT_LAST_INDEX(eh);
723         while (l <= r) {
724                 m = l + (r - l) / 2;
725                 if (block < le32_to_cpu(m->ei_block))
726                         r = m - 1;
727                 else
728                         l = m + 1;
729                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
730                                 m, le32_to_cpu(m->ei_block),
731                                 r, le32_to_cpu(r->ei_block));
732         }
733
734         path->p_idx = l - 1;
735         ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
736                   ext4_idx_pblock(path->p_idx));
737
738 #ifdef CHECK_BINSEARCH
739         {
740                 struct ext4_extent_idx *chix, *ix;
741                 int k;
742
743                 chix = ix = EXT_FIRST_INDEX(eh);
744                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
745                         if (k != 0 && le32_to_cpu(ix->ei_block) <=
746                             le32_to_cpu(ix[-1].ei_block)) {
747                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
748                                        "first=0x%p\n", k,
749                                        ix, EXT_FIRST_INDEX(eh));
750                                 printk(KERN_DEBUG "%u <= %u\n",
751                                        le32_to_cpu(ix->ei_block),
752                                        le32_to_cpu(ix[-1].ei_block));
753                         }
754                         BUG_ON(k && le32_to_cpu(ix->ei_block)
755                                            <= le32_to_cpu(ix[-1].ei_block));
756                         if (block < le32_to_cpu(ix->ei_block))
757                                 break;
758                         chix = ix;
759                 }
760                 BUG_ON(chix != path->p_idx);
761         }
762 #endif
763
764 }
765
766 /*
767  * ext4_ext_binsearch:
768  * binary search for the closest extent to the given block.
769  * The header must be checked before calling this.
770  */
771 static void
772 ext4_ext_binsearch(struct inode *inode,
773                 struct ext4_ext_path *path, ext4_lblk_t block)
774 {
775         struct ext4_extent_header *eh = path->p_hdr;
776         struct ext4_extent *r, *l, *m;
777
778         if (eh->eh_entries == 0) {
779                 /*
780                  * this leaf is empty:
781                  * we get such a leaf in split/add case
782                  */
783                 return;
784         }
785
786         ext_debug("binsearch for %u:  ", block);
787
788         l = EXT_FIRST_EXTENT(eh) + 1;
789         r = EXT_LAST_EXTENT(eh);
790
791         while (l <= r) {
792                 m = l + (r - l) / 2;
793                 if (block < le32_to_cpu(m->ee_block))
794                         r = m - 1;
795                 else
796                         l = m + 1;
797                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
798                                 m, le32_to_cpu(m->ee_block),
799                                 r, le32_to_cpu(r->ee_block));
800         }
801
802         path->p_ext = l - 1;
803         ext_debug("  -> %d:%llu:[%d]%d ",
804                         le32_to_cpu(path->p_ext->ee_block),
805                         ext4_ext_pblock(path->p_ext),
806                         ext4_ext_is_unwritten(path->p_ext),
807                         ext4_ext_get_actual_len(path->p_ext));
808
809 #ifdef CHECK_BINSEARCH
810         {
811                 struct ext4_extent *chex, *ex;
812                 int k;
813
814                 chex = ex = EXT_FIRST_EXTENT(eh);
815                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
816                         BUG_ON(k && le32_to_cpu(ex->ee_block)
817                                           <= le32_to_cpu(ex[-1].ee_block));
818                         if (block < le32_to_cpu(ex->ee_block))
819                                 break;
820                         chex = ex;
821                 }
822                 BUG_ON(chex != path->p_ext);
823         }
824 #endif
825
826 }
827
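/*
 * Initialize an empty extent tree rooted in the inode body: depth 0,
 * no entries, and eh_max limited to what fits in i_data.
 */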
828 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
829 {
830         struct ext4_extent_header *eh;
831
832         eh = ext_inode_hdr(inode);
833         eh->eh_depth = 0;
834         eh->eh_entries = 0;
835         eh->eh_magic = EXT4_EXT_MAGIC;
836         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
837         ext4_mark_inode_dirty(handle, inode);
838         return 0;
839 }
840
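/*
 * ext4_find_extent:
 * walk the tree from the root down to the leaf that should contain @block,
 * recording at each level the index entry chosen by binary search.  The
 * returned path array is newly allocated or reuses *@orig_path; on failure
 * an ERR_PTR is returned and *@orig_path is cleared.
 */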
841 struct ext4_ext_path *
842 ext4_find_extent(struct inode *inode, ext4_lblk_t block,
843                  struct ext4_ext_path **orig_path, int flags)
844 {
845         struct ext4_extent_header *eh;
846         struct buffer_head *bh;
847         struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
848         short int depth, i, ppos = 0;
849         int ret;
850
851         eh = ext_inode_hdr(inode);
852         depth = ext_depth(inode);
853         if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
854                 EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
855                                  depth);
856                 ret = -EFSCORRUPTED;
857                 goto err;
858         }
859
860         if (path) {
861                 ext4_ext_drop_refs(path);
862                 if (depth > path[0].p_maxdepth) {
863                         kfree(path);
864                         *orig_path = path = NULL;
865                 }
866         }
867         if (!path) {
868                 /* account possible depth increase */
869                 path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
870                                 GFP_NOFS);
871                 if (unlikely(!path))
872                         return ERR_PTR(-ENOMEM);
873                 path[0].p_maxdepth = depth + 1;
874         }
875         path[0].p_hdr = eh;
876         path[0].p_bh = NULL;
877
878         i = depth;
879         if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
880                 ext4_cache_extents(inode, eh);
881         /* walk through the tree */
882         while (i) {
883                 ext_debug("depth %d: num %d, max %d\n",
884                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
885
886                 ext4_ext_binsearch_idx(inode, path + ppos, block);
887                 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
888                 path[ppos].p_depth = i;
889                 path[ppos].p_ext = NULL;
890
891                 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
892                                             flags);
893                 if (IS_ERR(bh)) {
894                         ret = PTR_ERR(bh);
895                         goto err;
896                 }
897
898                 eh = ext_block_hdr(bh);
899                 ppos++;
900                 path[ppos].p_bh = bh;
901                 path[ppos].p_hdr = eh;
902         }
903
904         path[ppos].p_depth = i;
905         path[ppos].p_ext = NULL;
906         path[ppos].p_idx = NULL;
907
908         /* find extent */
909         ext4_ext_binsearch(inode, path + ppos, block);
910         /* if not an empty leaf */
911         if (path[ppos].p_ext)
912                 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
913
914         ext4_ext_show_path(inode, path);
915
916         return path;
917
918 err:
919         ext4_ext_drop_refs(path);
920         kfree(path);
921         if (orig_path)
922                 *orig_path = NULL;
923         return ERR_PTR(ret);
924 }
925
926 /*
927  * ext4_ext_insert_index:
928  * insert new index [@logical;@ptr] into the block at @curp;
929  * check where to insert: before @curp or after @curp
930  */
931 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
932                                  struct ext4_ext_path *curp,
933                                  int logical, ext4_fsblk_t ptr)
934 {
935         struct ext4_extent_idx *ix;
936         int len, err;
937
938         err = ext4_ext_get_access(handle, inode, curp);
939         if (err)
940                 return err;
941
942         if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
943                 EXT4_ERROR_INODE(inode,
944                                  "logical %d == ei_block %d!",
945                                  logical, le32_to_cpu(curp->p_idx->ei_block));
946                 return -EFSCORRUPTED;
947         }
948
949         if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
950                              >= le16_to_cpu(curp->p_hdr->eh_max))) {
951                 EXT4_ERROR_INODE(inode,
952                                  "eh_entries %d >= eh_max %d!",
953                                  le16_to_cpu(curp->p_hdr->eh_entries),
954                                  le16_to_cpu(curp->p_hdr->eh_max));
955                 return -EFSCORRUPTED;
956         }
957
958         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
959                 /* insert after */
960                 ext_debug("insert new index %d after: %llu\n", logical, ptr);
961                 ix = curp->p_idx + 1;
962         } else {
963                 /* insert before */
964                 ext_debug("insert new index %d before: %llu\n", logical, ptr);
965                 ix = curp->p_idx;
966         }
967
968         len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
969         BUG_ON(len < 0);
970         if (len > 0) {
971                 ext_debug("insert new index %d: "
972                                 "move %d indices from 0x%p to 0x%p\n",
973                                 logical, len, ix, ix + 1);
974                 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
975         }
976
977         if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
978                 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
979                 return -EFSCORRUPTED;
980         }
981
982         ix->ei_block = cpu_to_le32(logical);
983         ext4_idx_store_pblock(ix, ptr);
984         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
985
986         if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
987                 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
988                 return -EFSCORRUPTED;
989         }
990
991         err = ext4_ext_dirty(handle, inode, curp);
992         ext4_std_error(inode->i_sb, err);
993
994         return err;
995 }
996
997 /*
998  * ext4_ext_split:
999  * inserts a new subtree into the path, using the free index entry
1000  * at depth @at:
1001  * - allocates all needed blocks (new leaf and all intermediate index blocks)
1002  * - decides where to split
1003  * - moves the remaining extents and index entries (to the right of the
1004  *   split point) into the newly allocated blocks
1005  * - initializes the subtree
1006  */
1007 static int ext4_ext_split(handle_t *handle, struct inode *inode,
1008                           unsigned int flags,
1009                           struct ext4_ext_path *path,
1010                           struct ext4_extent *newext, int at)
1011 {
1012         struct buffer_head *bh = NULL;
1013         int depth = ext_depth(inode);
1014         struct ext4_extent_header *neh;
1015         struct ext4_extent_idx *fidx;
1016         int i = at, k, m, a;
1017         ext4_fsblk_t newblock, oldblock;
1018         __le32 border;
1019         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1020         int err = 0;
1021         size_t ext_size = 0;
1022
1023         /* make decision: where to split? */
1024         /* FIXME: now decision is simplest: at current extent */
1025
1026         /* if the current leaf will be split, then we should use
1027          * the border from the split point */
1028         if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1029                 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1030                 return -EFSCORRUPTED;
1031         }
1032         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1033                 border = path[depth].p_ext[1].ee_block;
1034                 ext_debug("leaf will be split."
1035                                 " next leaf starts at %d\n",
1036                                   le32_to_cpu(border));
1037         } else {
1038                 border = newext->ee_block;
1039                 ext_debug("leaf will be added."
1040                                 " next leaf starts at %d\n",
1041                                 le32_to_cpu(border));
1042         }
1043
1044         /*
1045          * If an error occurs, we break processing and mark the
1046          * filesystem read-only. The index won't be inserted and the
1047          * tree will stay in a consistent state. The next mount will
1048          * repair the buffers too.
1049          */
1050
1051         /*
1052          * Get an array to track all allocated blocks.
1053          * We need this to handle errors and free the blocks
1054          * on failure.
1055          */
1056         ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
1057         if (!ablocks)
1058                 return -ENOMEM;
1059
1060         /* allocate all needed blocks */
1061         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1062         for (a = 0; a < depth - at; a++) {
1063                 newblock = ext4_ext_new_meta_block(handle, inode, path,
1064                                                    newext, &err, flags);
1065                 if (newblock == 0)
1066                         goto cleanup;
1067                 ablocks[a] = newblock;
1068         }
1069
1070         /* initialize new leaf */
1071         newblock = ablocks[--a];
1072         if (unlikely(newblock == 0)) {
1073                 EXT4_ERROR_INODE(inode, "newblock == 0!");
1074                 err = -EFSCORRUPTED;
1075                 goto cleanup;
1076         }
1077         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1078         if (unlikely(!bh)) {
1079                 err = -ENOMEM;
1080                 goto cleanup;
1081         }
1082         lock_buffer(bh);
1083
1084         err = ext4_journal_get_create_access(handle, bh);
1085         if (err)
1086                 goto cleanup;
1087
1088         neh = ext_block_hdr(bh);
1089         neh->eh_entries = 0;
1090         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1091         neh->eh_magic = EXT4_EXT_MAGIC;
1092         neh->eh_depth = 0;
1093
1094         /* move remainder of path[depth] to the new leaf */
1095         if (unlikely(path[depth].p_hdr->eh_entries !=
1096                      path[depth].p_hdr->eh_max)) {
1097                 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1098                                  path[depth].p_hdr->eh_entries,
1099                                  path[depth].p_hdr->eh_max);
1100                 err = -EFSCORRUPTED;
1101                 goto cleanup;
1102         }
1103         /* start copy from next extent */
1104         m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1105         ext4_ext_show_move(inode, path, newblock, depth);
1106         if (m) {
1107                 struct ext4_extent *ex;
1108                 ex = EXT_FIRST_EXTENT(neh);
1109                 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1110                 le16_add_cpu(&neh->eh_entries, m);
1111         }
1112
1113         /* zero out unused area in the extent block */
1114         ext_size = sizeof(struct ext4_extent_header) +
1115                 sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1116         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1117         ext4_extent_block_csum_set(inode, neh);
1118         set_buffer_uptodate(bh);
1119         unlock_buffer(bh);
1120
1121         err = ext4_handle_dirty_metadata(handle, inode, bh);
1122         if (err)
1123                 goto cleanup;
1124         brelse(bh);
1125         bh = NULL;
1126
1127         /* correct old leaf */
1128         if (m) {
1129                 err = ext4_ext_get_access(handle, inode, path + depth);
1130                 if (err)
1131                         goto cleanup;
1132                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1133                 err = ext4_ext_dirty(handle, inode, path + depth);
1134                 if (err)
1135                         goto cleanup;
1136
1137         }
1138
1139         /* create intermediate indexes */
1140         k = depth - at - 1;
1141         if (unlikely(k < 0)) {
1142                 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1143                 err = -EFSCORRUPTED;
1144                 goto cleanup;
1145         }
1146         if (k)
1147                 ext_debug("create %d intermediate indices\n", k);
1148         /* insert new index into current index block */
1149         /* current depth stored in i var */
1150         i = depth - 1;
1151         while (k--) {
1152                 oldblock = newblock;
1153                 newblock = ablocks[--a];
1154                 bh = sb_getblk(inode->i_sb, newblock);
1155                 if (unlikely(!bh)) {
1156                         err = -ENOMEM;
1157                         goto cleanup;
1158                 }
1159                 lock_buffer(bh);
1160
1161                 err = ext4_journal_get_create_access(handle, bh);
1162                 if (err)
1163                         goto cleanup;
1164
1165                 neh = ext_block_hdr(bh);
1166                 neh->eh_entries = cpu_to_le16(1);
1167                 neh->eh_magic = EXT4_EXT_MAGIC;
1168                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1169                 neh->eh_depth = cpu_to_le16(depth - i);
1170                 fidx = EXT_FIRST_INDEX(neh);
1171                 fidx->ei_block = border;
1172                 ext4_idx_store_pblock(fidx, oldblock);
1173
1174                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1175                                 i, newblock, le32_to_cpu(border), oldblock);
1176
1177                 /* move remainder of path[i] to the new index block */
1178                 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1179                                         EXT_LAST_INDEX(path[i].p_hdr))) {
1180                         EXT4_ERROR_INODE(inode,
1181                                          "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1182                                          le32_to_cpu(path[i].p_ext->ee_block));
1183                         err = -EFSCORRUPTED;
1184                         goto cleanup;
1185                 }
1186                 /* start copy indexes */
1187                 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1188                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1189                                 EXT_MAX_INDEX(path[i].p_hdr));
1190                 ext4_ext_show_move(inode, path, newblock, i);
1191                 if (m) {
1192                         memmove(++fidx, path[i].p_idx,
1193                                 sizeof(struct ext4_extent_idx) * m);
1194                         le16_add_cpu(&neh->eh_entries, m);
1195                 }
1196                 /* zero out unused area in the extent block */
1197                 ext_size = sizeof(struct ext4_extent_header) +
1198                    (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1199                 memset(bh->b_data + ext_size, 0,
1200                         inode->i_sb->s_blocksize - ext_size);
1201                 ext4_extent_block_csum_set(inode, neh);
1202                 set_buffer_uptodate(bh);
1203                 unlock_buffer(bh);
1204
1205                 err = ext4_handle_dirty_metadata(handle, inode, bh);
1206                 if (err)
1207                         goto cleanup;
1208                 brelse(bh);
1209                 bh = NULL;
1210
1211                 /* correct old index */
1212                 if (m) {
1213                         err = ext4_ext_get_access(handle, inode, path + i);
1214                         if (err)
1215                                 goto cleanup;
1216                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1217                         err = ext4_ext_dirty(handle, inode, path + i);
1218                         if (err)
1219                                 goto cleanup;
1220                 }
1221
1222                 i--;
1223         }
1224
1225         /* insert new index */
1226         err = ext4_ext_insert_index(handle, inode, path + at,
1227                                     le32_to_cpu(border), newblock);
1228
1229 cleanup:
1230         if (bh) {
1231                 if (buffer_locked(bh))
1232                         unlock_buffer(bh);
1233                 brelse(bh);
1234         }
1235
1236         if (err) {
1237                 /* free all allocated blocks in error case */
1238                 for (i = 0; i < depth; i++) {
1239                         if (!ablocks[i])
1240                                 continue;
1241                         ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1242                                          EXT4_FREE_BLOCKS_METADATA);
1243                 }
1244         }
1245         kfree(ablocks);
1246
1247         return err;
1248 }
1249
1250 /*
1251  * ext4_ext_grow_indepth:
1252  * implements the tree-growing procedure:
1253  * - allocates a new block
1254  * - moves the top-level data (index block or leaf) into the new block
1255  * - initializes a new top level, creating an index that points to the
1256  *   just-created block
1257  */
1258 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1259                                  unsigned int flags)
1260 {
1261         struct ext4_extent_header *neh;
1262         struct buffer_head *bh;
1263         ext4_fsblk_t newblock, goal = 0;
1264         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1265         int err = 0;
1266         size_t ext_size = 0;
1267
1268         /* Try to prepend new index to old one */
1269         if (ext_depth(inode))
1270                 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1271         if (goal > le32_to_cpu(es->s_first_data_block)) {
1272                 flags |= EXT4_MB_HINT_TRY_GOAL;
1273                 goal--;
1274         } else
1275                 goal = ext4_inode_to_goal_block(inode);
1276         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1277                                         NULL, &err);
1278         if (newblock == 0)
1279                 return err;
1280
1281         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1282         if (unlikely(!bh))
1283                 return -ENOMEM;
1284         lock_buffer(bh);
1285
1286         err = ext4_journal_get_create_access(handle, bh);
1287         if (err) {
1288                 unlock_buffer(bh);
1289                 goto out;
1290         }
1291
1292         ext_size = sizeof(EXT4_I(inode)->i_data);
1293         /* move top-level index/leaf into new block */
1294         memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1295         /* zero out unused area in the extent block */
1296         memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1297
1298         /* set size of new block */
1299         neh = ext_block_hdr(bh);
1300         /* the old root could hold indexes or leaves,
1301          * so calculate eh_max the right way */
1302         if (ext_depth(inode))
1303                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1304         else
1305                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1306         neh->eh_magic = EXT4_EXT_MAGIC;
1307         ext4_extent_block_csum_set(inode, neh);
1308         set_buffer_uptodate(bh);
1309         unlock_buffer(bh);
1310
1311         err = ext4_handle_dirty_metadata(handle, inode, bh);
1312         if (err)
1313                 goto out;
1314
1315         /* Update top-level index: num,max,pointer */
1316         neh = ext_inode_hdr(inode);
1317         neh->eh_entries = cpu_to_le16(1);
1318         ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1319         if (neh->eh_depth == 0) {
1320                 /* Root extent block becomes index block */
1321                 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1322                 EXT_FIRST_INDEX(neh)->ei_block =
1323                         EXT_FIRST_EXTENT(neh)->ee_block;
1324         }
1325         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1326                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1327                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1328                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1329
1330         le16_add_cpu(&neh->eh_depth, 1);
1331         ext4_mark_inode_dirty(handle, inode);
1332 out:
1333         brelse(bh);
1334
1335         return err;
1336 }
1337
1338 /*
1339  * ext4_ext_create_new_leaf:
1340  * finds an empty index entry and adds a new leaf.
1341  * If no free index entry is found, it requests growing the tree in depth.
1342  */
1343 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1344                                     unsigned int mb_flags,
1345                                     unsigned int gb_flags,
1346                                     struct ext4_ext_path **ppath,
1347                                     struct ext4_extent *newext)
1348 {
1349         struct ext4_ext_path *path = *ppath;
1350         struct ext4_ext_path *curp;
1351         int depth, i, err = 0;
1352
1353 repeat:
1354         i = depth = ext_depth(inode);
1355
1356         /* walk up the tree and look for a free index entry */
1357         curp = path + depth;
1358         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1359                 i--;
1360                 curp--;
1361         }
1362
1363         /* we use an already allocated block for the index block,
1364          * so subsequent data blocks should be contiguous */
1365         if (EXT_HAS_FREE_INDEX(curp)) {
1366                 /* if we found index with free entry, then use that
1367                  * entry: create all needed subtree and add new leaf */
1368                 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1369                 if (err)
1370                         goto out;
1371
1372                 /* refill path */
1373                 path = ext4_find_extent(inode,
1374                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1375                                     ppath, gb_flags);
1376                 if (IS_ERR(path))
1377                         err = PTR_ERR(path);
1378         } else {
1379                 /* tree is full, time to grow in depth */
1380                 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1381                 if (err)
1382                         goto out;
1383
1384                 /* refill path */
1385                 path = ext4_find_extent(inode,
1386                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1387                                     ppath, gb_flags);
1388                 if (IS_ERR(path)) {
1389                         err = PTR_ERR(path);
1390                         goto out;
1391                 }
1392
1393                 /*
1394                  * only first (depth 0 -> 1) produces free space;
1395                  * in all other cases we have to split the grown tree
1396                  */
1397                 depth = ext_depth(inode);
1398                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1399                         /* now we need to split */
1400                         goto repeat;
1401                 }
1402         }
1403
1404 out:
1405         return err;
1406 }
1407
1408 /*
1409  * Search for the closest allocated block to the left of *logical
1410  * and return it at @logical, with its physical address at @phys.
1411  * If *logical is the smallest allocated block, the function
1412  * returns 0 at @phys.
1413  * The return value is 0 (success) or an error code.
1414  */
1415 static int ext4_ext_search_left(struct inode *inode,
1416                                 struct ext4_ext_path *path,
1417                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1418 {
1419         struct ext4_extent_idx *ix;
1420         struct ext4_extent *ex;
1421         int depth, ee_len;
1422
1423         if (unlikely(path == NULL)) {
1424                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1425                 return -EFSCORRUPTED;
1426         }
1427         depth = path->p_depth;
1428         *phys = 0;
1429
1430         if (depth == 0 && path->p_ext == NULL)
1431                 return 0;
1432
1433         /* usually the extent in the path covers blocks smaller
1434          * than *logical, but it can be that the extent is the
1435          * first one in the file */
1436
1437         ex = path[depth].p_ext;
1438         ee_len = ext4_ext_get_actual_len(ex);
1439         if (*logical < le32_to_cpu(ex->ee_block)) {
1440                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1441                         EXT4_ERROR_INODE(inode,
1442                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1443                                          *logical, le32_to_cpu(ex->ee_block));
1444                         return -EFSCORRUPTED;
1445                 }
1446                 while (--depth >= 0) {
1447                         ix = path[depth].p_idx;
1448                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1449                                 EXT4_ERROR_INODE(inode,
1450                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1451                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1452                                   EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1453                 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1454                                   depth);
1455                                 return -EFSCORRUPTED;
1456                         }
1457                 }
1458                 return 0;
1459         }
1460
1461         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1462                 EXT4_ERROR_INODE(inode,
1463                                  "logical %d < ee_block %d + ee_len %d!",
1464                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1465                 return -EFSCORRUPTED;
1466         }
1467
1468         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1469         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1470         return 0;
1471 }
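/*
 * Illustrative example for ext4_ext_search_left() (values are arbitrary):
 * with path->p_ext describing the extent [ee_block=100, len=8, pblk=5000]
 * and *logical = 205, i.e. a block beyond that extent, the function reports
 * the extent's last allocated block: *logical becomes 107 and *phys becomes
 * 5007.  If *logical were to the left of every allocated block, *phys would
 * simply be left at 0.
 */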
1472
1473 /*
1474  * Search for the closest allocated block to the right of *logical
1475  * and return it at @logical together with its physical address at @phys.
1476  * If *logical is the largest allocated block, the function
1477  * returns 0 at @phys.
1478  * The return value is 0 (success) or an error code.
1479  */
1480 static int ext4_ext_search_right(struct inode *inode,
1481                                  struct ext4_ext_path *path,
1482                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1483                                  struct ext4_extent **ret_ex)
1484 {
1485         struct buffer_head *bh = NULL;
1486         struct ext4_extent_header *eh;
1487         struct ext4_extent_idx *ix;
1488         struct ext4_extent *ex;
1489         ext4_fsblk_t block;
1490         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1491         int ee_len;
1492
1493         if (unlikely(path == NULL)) {
1494                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1495                 return -EFSCORRUPTED;
1496         }
1497         depth = path->p_depth;
1498         *phys = 0;
1499
1500         if (depth == 0 && path->p_ext == NULL)
1501                 return 0;
1502
1503         /* usually the extent in the path covers blocks smaller
1504          * than *logical, but it can be that the extent is the
1505          * first one in the file */
1506
1507         ex = path[depth].p_ext;
1508         ee_len = ext4_ext_get_actual_len(ex);
1509         if (*logical < le32_to_cpu(ex->ee_block)) {
1510                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1511                         EXT4_ERROR_INODE(inode,
1512                                          "first_extent(path[%d].p_hdr) != ex",
1513                                          depth);
1514                         return -EFSCORRUPTED;
1515                 }
1516                 while (--depth >= 0) {
1517                         ix = path[depth].p_idx;
1518                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1519                                 EXT4_ERROR_INODE(inode,
1520                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1521                                                  *logical);
1522                                 return -EFSCORRUPTED;
1523                         }
1524                 }
1525                 goto found_extent;
1526         }
1527
1528         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1529                 EXT4_ERROR_INODE(inode,
1530                                  "logical %d < ee_block %d + ee_len %d!",
1531                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1532                 return -EFSCORRUPTED;
1533         }
1534
1535         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1536                 /* next allocated block in this leaf */
1537                 ex++;
1538                 goto found_extent;
1539         }
1540
1541         /* go up and search for index to the right */
1542         while (--depth >= 0) {
1543                 ix = path[depth].p_idx;
1544                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1545                         goto got_index;
1546         }
1547
1548         /* we've gone up to the root and found no index to the right */
1549         return 0;
1550
1551 got_index:
1552         /* we've found index to the right, let's
1553          * follow it and find the closest allocated
1554          * block to the right */
1555         ix++;
1556         block = ext4_idx_pblock(ix);
1557         while (++depth < path->p_depth) {
1558                 /* subtract from p_depth to get proper eh_depth */
1559                 bh = read_extent_tree_block(inode, block,
1560                                             path->p_depth - depth, 0);
1561                 if (IS_ERR(bh))
1562                         return PTR_ERR(bh);
1563                 eh = ext_block_hdr(bh);
1564                 ix = EXT_FIRST_INDEX(eh);
1565                 block = ext4_idx_pblock(ix);
1566                 put_bh(bh);
1567         }
1568
1569         bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1570         if (IS_ERR(bh))
1571                 return PTR_ERR(bh);
1572         eh = ext_block_hdr(bh);
1573         ex = EXT_FIRST_EXTENT(eh);
1574 found_extent:
1575         *logical = le32_to_cpu(ex->ee_block);
1576         *phys = ext4_ext_pblock(ex);
1577         *ret_ex = ex;
1578         if (bh)
1579                 put_bh(bh);
1580         return 0;
1581 }
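/*
 * Illustrative example for ext4_ext_search_right() (values are arbitrary):
 * if the leaf holds the extents [100, len=8, pblk=5000] and
 * [300, len=16, pblk=9000] and *logical = 205, the path's extent is the
 * first one and is not the last in its leaf, so the next extent is taken:
 * *logical becomes 300, *phys becomes 9000 and *ret_ex points at it.  When
 * the path's extent is the last one in the leaf, the search climbs the
 * index blocks instead to reach the next leaf to the right.
 */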
1582
1583 /*
1584  * ext4_ext_next_allocated_block:
1585  * returns the first allocated block in the subsequent extent, or EXT_MAX_BLOCKS.
1586  * NOTE: it considers the block number from an index entry as an
1587  * allocated block. Thus, index entries have to be consistent
1588  * with leaves.
1589  */
1590 ext4_lblk_t
1591 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1592 {
1593         int depth;
1594
1595         BUG_ON(path == NULL);
1596         depth = path->p_depth;
1597
1598         if (depth == 0 && path->p_ext == NULL)
1599                 return EXT_MAX_BLOCKS;
1600
1601         while (depth >= 0) {
1602                 struct ext4_ext_path *p = &path[depth];
1603
1604                 if (depth == path->p_depth) {
1605                         /* leaf */
1606                         if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1607                                 return le32_to_cpu(p->p_ext[1].ee_block);
1608                 } else {
1609                         /* index */
1610                         if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1611                                 return le32_to_cpu(p->p_idx[1].ei_block);
1612                 }
1613                 depth--;
1614         }
1615
1616         return EXT_MAX_BLOCKS;
1617 }
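/*
 * Descriptive note: the loop above starts at the leaf level and walks toward
 * the root, returning the starting block of the first extent (or index
 * entry) found to the right of the current path position; only when every
 * level is already at its last entry does it fall back to EXT_MAX_BLOCKS.
 */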
1618
1619 /*
1620  * ext4_ext_next_leaf_block:
1621  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1622  */
1623 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1624 {
1625         int depth;
1626
1627         BUG_ON(path == NULL);
1628         depth = path->p_depth;
1629
1630         /* zero-tree has no leaf blocks at all */
1631         if (depth == 0)
1632                 return EXT_MAX_BLOCKS;
1633
1634         /* go to index block */
1635         depth--;
1636
1637         while (depth >= 0) {
1638                 if (path[depth].p_idx !=
1639                                 EXT_LAST_INDEX(path[depth].p_hdr))
1640                         return (ext4_lblk_t)
1641                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1642                 depth--;
1643         }
1644
1645         return EXT_MAX_BLOCKS;
1646 }
1647
1648 /*
1649  * ext4_ext_correct_indexes:
1650  * if a leaf gets modified and the modified extent is the first in the leaf,
1651  * then we have to correct all indexes above.
1652  * TODO: do we need to correct tree in all cases?
1653  */
1654 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1655                                 struct ext4_ext_path *path)
1656 {
1657         struct ext4_extent_header *eh;
1658         int depth = ext_depth(inode);
1659         struct ext4_extent *ex;
1660         __le32 border;
1661         int k, err = 0;
1662
1663         eh = path[depth].p_hdr;
1664         ex = path[depth].p_ext;
1665
1666         if (unlikely(ex == NULL || eh == NULL)) {
1667                 EXT4_ERROR_INODE(inode,
1668                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1669                 return -EFSCORRUPTED;
1670         }
1671
1672         if (depth == 0) {
1673                 /* there is no tree at all */
1674                 return 0;
1675         }
1676
1677         if (ex != EXT_FIRST_EXTENT(eh)) {
1678                 /* we correct tree if first leaf got modified only */
1679                 return 0;
1680         }
1681
1682         /*
1683          * TODO: we need correction if border is smaller than current one
1684          */
1685         k = depth - 1;
1686         border = path[depth].p_ext->ee_block;
1687         err = ext4_ext_get_access(handle, inode, path + k);
1688         if (err)
1689                 return err;
1690         path[k].p_idx->ei_block = border;
1691         err = ext4_ext_dirty(handle, inode, path + k);
1692         if (err)
1693                 return err;
1694
1695         while (k--) {
1696                 /* change all left-side indexes */
1697                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1698                         break;
1699                 err = ext4_ext_get_access(handle, inode, path + k);
1700                 if (err)
1701                         break;
1702                 path[k].p_idx->ei_block = border;
1703                 err = ext4_ext_dirty(handle, inode, path + k);
1704                 if (err)
1705                         break;
1706         }
1707
1708         return err;
1709 }
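/*
 * Illustrative example for ext4_ext_correct_indexes() (values are arbitrary):
 * if the first extent of a leaf used to start at logical block 100 and now
 * starts at 200, the parent index entry keyed on that leaf is updated to 200
 * as well, and the update keeps propagating upward for as long as the child
 * entry is the first index in its block, keeping all ancestors consistent.
 */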
1710
1711 static int ext4_can_extents_be_merged(struct inode *inode,
1712                                       struct ext4_extent *ex1,
1713                                       struct ext4_extent *ex2)
1714 {
1715         unsigned short ext1_ee_len, ext2_ee_len;
1716
1717         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1718                 return 0;
1719
1720         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1721         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1722
1723         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1724                         le32_to_cpu(ex2->ee_block))
1725                 return 0;
1726
1727         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1728                 return 0;
1729
1730         if (ext4_ext_is_unwritten(ex1) &&
1731             ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1732                 return 0;
1733 #ifdef AGGRESSIVE_TEST
1734         if (ext1_ee_len >= 4)
1735                 return 0;
1736 #endif
1737
1738         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1739                 return 1;
1740         return 0;
1741 }
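/*
 * Illustrative example for ext4_can_extents_be_merged() (values arbitrary):
 * ex1 = [ee_block=100, len=8, pblk=5000] and ex2 = [ee_block=108, len=4,
 * pblk=5008] are both written, logically contiguous (100 + 8 == 108),
 * physically contiguous (5000 + 8 == 5008) and their combined length of 12
 * blocks stays below EXT_INIT_MAX_LEN, so (absent AGGRESSIVE_TEST) the
 * function returns 1 and they may be merged into [100, len=12, pblk=5000].
 */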
1742
1743 /*
1744  * This function tries to merge the "ex" extent to the next extent in the tree.
1745  * It always tries to merge towards right. If you want to merge towards
1746  * left, pass "ex - 1" as argument instead of "ex".
1747  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1748  * 1 if they got merged.
1749  */
1750 static int ext4_ext_try_to_merge_right(struct inode *inode,
1751                                  struct ext4_ext_path *path,
1752                                  struct ext4_extent *ex)
1753 {
1754         struct ext4_extent_header *eh;
1755         unsigned int depth, len;
1756         int merge_done = 0, unwritten;
1757
1758         depth = ext_depth(inode);
1759         BUG_ON(path[depth].p_hdr == NULL);
1760         eh = path[depth].p_hdr;
1761
1762         while (ex < EXT_LAST_EXTENT(eh)) {
1763                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1764                         break;
1765                 /* merge with next extent! */
1766                 unwritten = ext4_ext_is_unwritten(ex);
1767                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1768                                 + ext4_ext_get_actual_len(ex + 1));
1769                 if (unwritten)
1770                         ext4_ext_mark_unwritten(ex);
1771
1772                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1773                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1774                                 * sizeof(struct ext4_extent);
1775                         memmove(ex + 1, ex + 2, len);
1776                 }
1777                 le16_add_cpu(&eh->eh_entries, -1);
1778                 merge_done = 1;
1779                 WARN_ON(eh->eh_entries == 0);
1780                 if (!eh->eh_entries)
1781                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1782         }
1783
1784         return merge_done;
1785 }
1786
1787 /*
1788  * This function does a very simple check to see if we can collapse
1789  * an extent tree with a single extent tree leaf block into the inode.
1790  */
1791 static void ext4_ext_try_to_merge_up(handle_t *handle,
1792                                      struct inode *inode,
1793                                      struct ext4_ext_path *path)
1794 {
1795         size_t s;
1796         unsigned max_root = ext4_ext_space_root(inode, 0);
1797         ext4_fsblk_t blk;
1798
1799         if ((path[0].p_depth != 1) ||
1800             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1801             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1802                 return;
1803
1804         /*
1805          * We need to modify the block allocation bitmap and the block
1806          * group descriptor to release the extent tree block.  If we
1807          * can't get the journal credits, give up.
1808          */
1809         if (ext4_journal_extend(handle, 2,
1810                         ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1811                 return;
1812
1813         /*
1814          * Copy the extent data up to the inode
1815          */
1816         blk = ext4_idx_pblock(path[0].p_idx);
1817         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1818                 sizeof(struct ext4_extent_idx);
1819         s += sizeof(struct ext4_extent_header);
1820
1821         path[1].p_maxdepth = path[0].p_maxdepth;
1822         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1823         path[0].p_depth = 0;
1824         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1825                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1826         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1827
1828         brelse(path[1].p_bh);
1829         ext4_free_blocks(handle, inode, NULL, blk, 1,
1830                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1831 }
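/*
 * Descriptive note: the collapse above happens only when the tree has depth
 * 1, the root holds a single index, and the sole leaf's entries also fit in
 * the inode's i_data root (at most ext4_ext_space_root() entries, typically
 * 4 on a standard ext4 inode).  The leaf's contents are copied into the
 * inode and the now-unused leaf block is freed.
 */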
1832
1833 /*
1834  * This function tries to merge the @ex extent to neighbours in the tree, then
1835  * tries to collapse the extent tree into the inode.
1836  */
1837 static void ext4_ext_try_to_merge(handle_t *handle,
1838                                   struct inode *inode,
1839                                   struct ext4_ext_path *path,
1840                                   struct ext4_extent *ex)
1841 {
1842         struct ext4_extent_header *eh;
1843         unsigned int depth;
1844         int merge_done = 0;
1845
1846         depth = ext_depth(inode);
1847         BUG_ON(path[depth].p_hdr == NULL);
1848         eh = path[depth].p_hdr;
1849
1850         if (ex > EXT_FIRST_EXTENT(eh))
1851                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1852
1853         if (!merge_done)
1854                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1855
1856         ext4_ext_try_to_merge_up(handle, inode, path);
1857 }
1858
1859 /*
1860  * check if a portion of the "newext" extent overlaps with an
1861  * existing extent.
1862  *
1863  * If there is an overlap discovered, it updates the length of the newext
1864  * such that there will be no overlap, and then returns 1.
1865  * If there is no overlap found, it returns 0.
1866  */
1867 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1868                                            struct inode *inode,
1869                                            struct ext4_extent *newext,
1870                                            struct ext4_ext_path *path)
1871 {
1872         ext4_lblk_t b1, b2;
1873         unsigned int depth, len1;
1874         unsigned int ret = 0;
1875
1876         b1 = le32_to_cpu(newext->ee_block);
1877         len1 = ext4_ext_get_actual_len(newext);
1878         depth = ext_depth(inode);
1879         if (!path[depth].p_ext)
1880                 goto out;
1881         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1882
1883         /*
1884          * get the next allocated block if the extent in the path
1885          * is before the requested block(s)
1886          */
1887         if (b2 < b1) {
1888                 b2 = ext4_ext_next_allocated_block(path);
1889                 if (b2 == EXT_MAX_BLOCKS)
1890                         goto out;
1891                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1892         }
1893
1894         /* check for wrap through zero on extent logical start block*/
1895         if (b1 + len1 < b1) {
1896                 len1 = EXT_MAX_BLOCKS - b1;
1897                 newext->ee_len = cpu_to_le16(len1);
1898                 ret = 1;
1899         }
1900
1901         /* check for overlap */
1902         if (b1 + len1 > b2) {
1903                 newext->ee_len = cpu_to_le16(b2 - b1);
1904                 ret = 1;
1905         }
1906 out:
1907         return ret;
1908 }
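/*
 * Illustrative example for ext4_ext_check_overlap() (values arbitrary,
 * assuming a 1:1 cluster ratio so EXT4_LBLK_CMASK() changes nothing): if
 * newext starts at b1 = 100 with len1 = 50 and the closest existing
 * allocation to its right begins at b2 = 120, the tail of newext would
 * overlap it, so newext's length is trimmed to 20 blocks (120 - 100) and 1
 * is returned.
 */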
1909
1910 /*
1911  * ext4_ext_insert_extent:
1912  * tries to merge the requested extent into an existing extent or
1913  * inserts the requested extent as a new one into the tree,
1914  * creating a new leaf in the no-space case.
1915  */
1916 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1917                                 struct ext4_ext_path **ppath,
1918                                 struct ext4_extent *newext, int gb_flags)
1919 {
1920         struct ext4_ext_path *path = *ppath;
1921         struct ext4_extent_header *eh;
1922         struct ext4_extent *ex, *fex;
1923         struct ext4_extent *nearex; /* nearest extent */
1924         struct ext4_ext_path *npath = NULL;
1925         int depth, len, err;
1926         ext4_lblk_t next;
1927         int mb_flags = 0, unwritten;
1928
1929         if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1930                 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1931         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1932                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1933                 return -EFSCORRUPTED;
1934         }
1935         depth = ext_depth(inode);
1936         ex = path[depth].p_ext;
1937         eh = path[depth].p_hdr;
1938         if (unlikely(path[depth].p_hdr == NULL)) {
1939                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1940                 return -EFSCORRUPTED;
1941         }
1942
1943         /* try to insert block into found extent and return */
1944         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1945
1946                 /*
1947                  * Try to see whether we should rather test the extent on
1948                  * right from ex, or from the left of ex. This is because
1949                  * ext4_find_extent() can return either extent on the
1950                  * left, or on the right from the searched position. This
1951                  * will make merging more effective.
1952                  */
1953                 if (ex < EXT_LAST_EXTENT(eh) &&
1954                     (le32_to_cpu(ex->ee_block) +
1955                     ext4_ext_get_actual_len(ex) <
1956                     le32_to_cpu(newext->ee_block))) {
1957                         ex += 1;
1958                         goto prepend;
1959                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1960                            (le32_to_cpu(newext->ee_block) +
1961                            ext4_ext_get_actual_len(newext) <
1962                            le32_to_cpu(ex->ee_block)))
1963                         ex -= 1;
1964
1965                 /* Try to append newex to the ex */
1966                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
1967                         ext_debug("append [%d]%d block to %u:[%d]%d"
1968                                   "(from %llu)\n",
1969                                   ext4_ext_is_unwritten(newext),
1970                                   ext4_ext_get_actual_len(newext),
1971                                   le32_to_cpu(ex->ee_block),
1972                                   ext4_ext_is_unwritten(ex),
1973                                   ext4_ext_get_actual_len(ex),
1974                                   ext4_ext_pblock(ex));
1975                         err = ext4_ext_get_access(handle, inode,
1976                                                   path + depth);
1977                         if (err)
1978                                 return err;
1979                         unwritten = ext4_ext_is_unwritten(ex);
1980                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1981                                         + ext4_ext_get_actual_len(newext));
1982                         if (unwritten)
1983                                 ext4_ext_mark_unwritten(ex);
1984                         eh = path[depth].p_hdr;
1985                         nearex = ex;
1986                         goto merge;
1987                 }
1988
1989 prepend:
1990                 /* Try to prepend newex to the ex */
1991                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
1992                         ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1993                                   "(from %llu)\n",
1994                                   le32_to_cpu(newext->ee_block),
1995                                   ext4_ext_is_unwritten(newext),
1996                                   ext4_ext_get_actual_len(newext),
1997                                   le32_to_cpu(ex->ee_block),
1998                                   ext4_ext_is_unwritten(ex),
1999                                   ext4_ext_get_actual_len(ex),
2000                                   ext4_ext_pblock(ex));
2001                         err = ext4_ext_get_access(handle, inode,
2002                                                   path + depth);
2003                         if (err)
2004                                 return err;
2005
2006                         unwritten = ext4_ext_is_unwritten(ex);
2007                         ex->ee_block = newext->ee_block;
2008                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2009                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2010                                         + ext4_ext_get_actual_len(newext));
2011                         if (unwritten)
2012                                 ext4_ext_mark_unwritten(ex);
2013                         eh = path[depth].p_hdr;
2014                         nearex = ex;
2015                         goto merge;
2016                 }
2017         }
2018
2019         depth = ext_depth(inode);
2020         eh = path[depth].p_hdr;
2021         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2022                 goto has_space;
2023
2024         /* probably next leaf has space for us? */
2025         fex = EXT_LAST_EXTENT(eh);
2026         next = EXT_MAX_BLOCKS;
2027         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2028                 next = ext4_ext_next_leaf_block(path);
2029         if (next != EXT_MAX_BLOCKS) {
2030                 ext_debug("next leaf block - %u\n", next);
2031                 BUG_ON(npath != NULL);
2032                 npath = ext4_find_extent(inode, next, NULL, 0);
2033                 if (IS_ERR(npath))
2034                         return PTR_ERR(npath);
2035                 BUG_ON(npath->p_depth != path->p_depth);
2036                 eh = npath[depth].p_hdr;
2037                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2038                         ext_debug("next leaf isn't full(%d)\n",
2039                                   le16_to_cpu(eh->eh_entries));
2040                         path = npath;
2041                         goto has_space;
2042                 }
2043                 ext_debug("next leaf has no free space(%d,%d)\n",
2044                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2045         }
2046
2047         /*
2048          * There is no free space in the found leaf.
2049          * We're gonna add a new leaf in the tree.
2050          */
2051         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2052                 mb_flags |= EXT4_MB_USE_RESERVED;
2053         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2054                                        ppath, newext);
2055         if (err)
2056                 goto cleanup;
2057         depth = ext_depth(inode);
2058         eh = path[depth].p_hdr;
2059
2060 has_space:
2061         nearex = path[depth].p_ext;
2062
2063         err = ext4_ext_get_access(handle, inode, path + depth);
2064         if (err)
2065                 goto cleanup;
2066
2067         if (!nearex) {
2068                 /* there is no extent in this leaf, create first one */
2069                 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2070                                 le32_to_cpu(newext->ee_block),
2071                                 ext4_ext_pblock(newext),
2072                                 ext4_ext_is_unwritten(newext),
2073                                 ext4_ext_get_actual_len(newext));
2074                 nearex = EXT_FIRST_EXTENT(eh);
2075         } else {
2076                 if (le32_to_cpu(newext->ee_block)
2077                            > le32_to_cpu(nearex->ee_block)) {
2078                         /* Insert after */
2079                         ext_debug("insert %u:%llu:[%d]%d after: "
2080                                         "nearest %p\n",
2081                                         le32_to_cpu(newext->ee_block),
2082                                         ext4_ext_pblock(newext),
2083                                         ext4_ext_is_unwritten(newext),
2084                                         ext4_ext_get_actual_len(newext),
2085                                         nearex);
2086                         nearex++;
2087                 } else {
2088                         /* Insert before */
2089                         BUG_ON(newext->ee_block == nearex->ee_block);
2090                         ext_debug("insert %u:%llu:[%d]%d before: "
2091                                         "nearest %p\n",
2092                                         le32_to_cpu(newext->ee_block),
2093                                         ext4_ext_pblock(newext),
2094                                         ext4_ext_is_unwritten(newext),
2095                                         ext4_ext_get_actual_len(newext),
2096                                         nearex);
2097                 }
2098                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2099                 if (len > 0) {
2100                         ext_debug("insert %u:%llu:[%d]%d: "
2101                                         "move %d extents from 0x%p to 0x%p\n",
2102                                         le32_to_cpu(newext->ee_block),
2103                                         ext4_ext_pblock(newext),
2104                                         ext4_ext_is_unwritten(newext),
2105                                         ext4_ext_get_actual_len(newext),
2106                                         len, nearex, nearex + 1);
2107                         memmove(nearex + 1, nearex,
2108                                 len * sizeof(struct ext4_extent));
2109                 }
2110         }
2111
2112         le16_add_cpu(&eh->eh_entries, 1);
2113         path[depth].p_ext = nearex;
2114         nearex->ee_block = newext->ee_block;
2115         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2116         nearex->ee_len = newext->ee_len;
2117
2118 merge:
2119         /* try to merge extents */
2120         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2121                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2122
2123
2124         /* time to correct all indexes above */
2125         err = ext4_ext_correct_indexes(handle, inode, path);
2126         if (err)
2127                 goto cleanup;
2128
2129         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2130
2131 cleanup:
2132         ext4_ext_drop_refs(npath);
2133         kfree(npath);
2134         return err;
2135 }
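/*
 * Descriptive note on the flow above: ext4_ext_insert_extent() first tries
 * to merge the new extent into a neighbouring extent from @path (append or
 * prepend).  Failing that, it looks for a free slot in the current leaf,
 * then in the next leaf, and only then asks ext4_ext_create_new_leaf() to
 * split or deepen the tree.  After the raw insert it runs the merge pass
 * again (unless EXT4_GET_BLOCKS_PRE_IO) and finally corrects the index keys
 * above the possibly changed first extent.
 */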
2136
2137 static int ext4_fill_fiemap_extents(struct inode *inode,
2138                                     ext4_lblk_t block, ext4_lblk_t num,
2139                                     struct fiemap_extent_info *fieinfo)
2140 {
2141         struct ext4_ext_path *path = NULL;
2142         struct ext4_extent *ex;
2143         struct extent_status es;
2144         ext4_lblk_t next, next_del, start = 0, end = 0;
2145         ext4_lblk_t last = block + num;
2146         int exists, depth = 0, err = 0;
2147         unsigned int flags = 0;
2148         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2149
2150         while (block < last && block != EXT_MAX_BLOCKS) {
2151                 num = last - block;
2152                 /* find extent for this block */
2153                 down_read(&EXT4_I(inode)->i_data_sem);
2154
2155                 path = ext4_find_extent(inode, block, &path, 0);
2156                 if (IS_ERR(path)) {
2157                         up_read(&EXT4_I(inode)->i_data_sem);
2158                         err = PTR_ERR(path);
2159                         path = NULL;
2160                         break;
2161                 }
2162
2163                 depth = ext_depth(inode);
2164                 if (unlikely(path[depth].p_hdr == NULL)) {
2165                         up_read(&EXT4_I(inode)->i_data_sem);
2166                         EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2167                         err = -EFSCORRUPTED;
2168                         break;
2169                 }
2170                 ex = path[depth].p_ext;
2171                 next = ext4_ext_next_allocated_block(path);
2172
2173                 flags = 0;
2174                 exists = 0;
2175                 if (!ex) {
2176                         /* there is no extent yet, so try to allocate
2177                          * all requested space */
2178                         start = block;
2179                         end = block + num;
2180                 } else if (le32_to_cpu(ex->ee_block) > block) {
2181                         /* need to allocate space before found extent */
2182                         start = block;
2183                         end = le32_to_cpu(ex->ee_block);
2184                         if (block + num < end)
2185                                 end = block + num;
2186                 } else if (block >= le32_to_cpu(ex->ee_block)
2187                                         + ext4_ext_get_actual_len(ex)) {
2188                         /* need to allocate space after found extent */
2189                         start = block;
2190                         end = block + num;
2191                         if (end >= next)
2192                                 end = next;
2193                 } else if (block >= le32_to_cpu(ex->ee_block)) {
2194                         /*
2195                          * some part of requested space is covered
2196                          * by found extent
2197                          */
2198                         start = block;
2199                         end = le32_to_cpu(ex->ee_block)
2200                                 + ext4_ext_get_actual_len(ex);
2201                         if (block + num < end)
2202                                 end = block + num;
2203                         exists = 1;
2204                 } else {
2205                         BUG();
2206                 }
2207                 BUG_ON(end <= start);
2208
2209                 if (!exists) {
2210                         es.es_lblk = start;
2211                         es.es_len = end - start;
2212                         es.es_pblk = 0;
2213                 } else {
2214                         es.es_lblk = le32_to_cpu(ex->ee_block);
2215                         es.es_len = ext4_ext_get_actual_len(ex);
2216                         es.es_pblk = ext4_ext_pblock(ex);
2217                         if (ext4_ext_is_unwritten(ex))
2218                                 flags |= FIEMAP_EXTENT_UNWRITTEN;
2219                 }
2220
2221                 /*
2222                  * Find delayed extent and update es accordingly. We call
2223                  * it even in !exists case to find out whether es is the
2224                  * last existing extent or not.
2225                  */
2226                 next_del = ext4_find_delayed_extent(inode, &es);
2227                 if (!exists && next_del) {
2228                         exists = 1;
2229                         flags |= (FIEMAP_EXTENT_DELALLOC |
2230                                   FIEMAP_EXTENT_UNKNOWN);
2231                 }
2232                 up_read(&EXT4_I(inode)->i_data_sem);
2233
2234                 if (unlikely(es.es_len == 0)) {
2235                         EXT4_ERROR_INODE(inode, "es.es_len == 0");
2236                         err = -EFSCORRUPTED;
2237                         break;
2238                 }
2239
2240                 /*
2241                  * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2242                  * We need to check next == EXT_MAX_BLOCKS because an
2243                  * extent can carry both the unwritten and the delayed
2244                  * status: when a delayed-allocated extent is later
2245                  * allocated by fallocate, the extent status tree tracks
2246                  * both states in a single extent.
2247                  *
2248                  * So we could return an unwritten and delayed extent
2249                  * whose block is equal to 'next'.
2250                  */
2251                 if (next == next_del && next == EXT_MAX_BLOCKS) {
2252                         flags |= FIEMAP_EXTENT_LAST;
2253                         if (unlikely(next_del != EXT_MAX_BLOCKS ||
2254                                      next != EXT_MAX_BLOCKS)) {
2255                                 EXT4_ERROR_INODE(inode,
2256                                                  "next extent == %u, next "
2257                                                  "delalloc extent = %u",
2258                                                  next, next_del);
2259                                 err = -EFSCORRUPTED;
2260                                 break;
2261                         }
2262                 }
2263
2264                 if (exists) {
2265                         err = fiemap_fill_next_extent(fieinfo,
2266                                 (__u64)es.es_lblk << blksize_bits,
2267                                 (__u64)es.es_pblk << blksize_bits,
2268                                 (__u64)es.es_len << blksize_bits,
2269                                 flags);
2270                         if (err < 0)
2271                                 break;
2272                         if (err == 1) {
2273                                 err = 0;
2274                                 break;
2275                         }
2276                 }
2277
2278                 block = es.es_lblk + es.es_len;
2279         }
2280
2281         ext4_ext_drop_refs(path);
2282         kfree(path);
2283         return err;
2284 }
2285
2286 static int ext4_fill_es_cache_info(struct inode *inode,
2287                                    ext4_lblk_t block, ext4_lblk_t num,
2288                                    struct fiemap_extent_info *fieinfo)
2289 {
2290         ext4_lblk_t next, end = block + num - 1;
2291         struct extent_status es;
2292         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2293         unsigned int flags;
2294         int err;
2295
2296         while (block <= end) {
2297                 next = 0;
2298                 flags = 0;
2299                 if (!ext4_es_lookup_extent(inode, block, &next, &es))
2300                         break;
2301                 if (ext4_es_is_unwritten(&es))
2302                         flags |= FIEMAP_EXTENT_UNWRITTEN;
2303                 if (ext4_es_is_delayed(&es))
2304                         flags |= (FIEMAP_EXTENT_DELALLOC |
2305                                   FIEMAP_EXTENT_UNKNOWN);
2306                 if (ext4_es_is_hole(&es))
2307                         flags |= EXT4_FIEMAP_EXTENT_HOLE;
2308                 if (next == 0)
2309                         flags |= FIEMAP_EXTENT_LAST;
2310                 if (flags & (FIEMAP_EXTENT_DELALLOC|
2311                              EXT4_FIEMAP_EXTENT_HOLE))
2312                         es.es_pblk = 0;
2313                 else
2314                         es.es_pblk = ext4_es_pblock(&es);
2315                 err = fiemap_fill_next_extent(fieinfo,
2316                                 (__u64)es.es_lblk << blksize_bits,
2317                                 (__u64)es.es_pblk << blksize_bits,
2318                                 (__u64)es.es_len << blksize_bits,
2319                                 flags);
2320                 if (next == 0)
2321                         break;
2322                 block = next;
2323                 if (err < 0)
2324                         return err;
2325                 if (err == 1)
2326                         return 0;
2327         }
2328         return 0;
2329 }
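/*
 * Descriptive note: unlike ext4_fill_fiemap_extents() above, this helper
 * only reports what is already cached in the extent status tree via
 * ext4_es_lookup_extent(); one fiemap record is emitted per cached extent
 * (holes and delalloc ranges get a zero physical block) and the walk stops
 * at the last cached extent (next == 0) or as soon as the lookup finds
 * nothing for the current block.
 */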
2330
2331
2332 /*
2333  * ext4_ext_determine_hole - determine hole around given block
2334  * @inode:      inode we lookup in
2335  * @path:       path in extent tree to @lblk
2336  * @lblk:       pointer to logical block around which we want to determine hole
2337  *
2338  * Determine hole length (and start if easily possible) around given logical
2339  * block. We don't try too hard to find the beginning of the hole, but if
2340  * @path actually points to the extent before @lblk, we provide it.
2341  *
2342  * The function returns the length of a hole starting at @lblk. We update @lblk
2343  * to the beginning of the hole if we managed to find it.
2344  */
2345 static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2346                                            struct ext4_ext_path *path,
2347                                            ext4_lblk_t *lblk)
2348 {
2349         int depth = ext_depth(inode);
2350         struct ext4_extent *ex;
2351         ext4_lblk_t len;
2352
2353         ex = path[depth].p_ext;
2354         if (ex == NULL) {
2355                 /* there is no extent yet, so gap is [0;-] */
2356                 *lblk = 0;
2357                 len = EXT_MAX_BLOCKS;
2358         } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2359                 len = le32_to_cpu(ex->ee_block) - *lblk;
2360         } else if (*lblk >= le32_to_cpu(ex->ee_block)
2361                         + ext4_ext_get_actual_len(ex)) {
2362                 ext4_lblk_t next;
2363
2364                 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2365                 next = ext4_ext_next_allocated_block(path);
2366                 BUG_ON(next == *lblk);
2367                 len = next - *lblk;
2368         } else {
2369                 BUG();
2370         }
2371         return len;
2372 }
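/*
 * Illustrative example for ext4_ext_determine_hole() (values arbitrary):
 * with an extent [ee_block=100, len=8] in the path, the next allocated
 * block at 300, and *lblk = 150, the block falls after the extent, so *lblk
 * is moved to 108 (the first block of the hole) and a length of 192
 * (300 - 108) is returned.  For *lblk = 50 the returned length would be 50
 * (100 - 50) and *lblk would be left untouched, since the hole's true start
 * is not searched for.
 */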
2373
2374 /*
2375  * ext4_ext_put_gap_in_cache:
2376  * calculate boundaries of the gap that the requested block fits into
2377  * and cache this gap
2378  */
2379 static void
2380 ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2381                           ext4_lblk_t hole_len)
2382 {
2383         struct extent_status es;
2384
2385         ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2386                                   hole_start + hole_len - 1, &es);
2387         if (es.es_len) {
2388                 /* Is there a delayed extent containing lblock? */
2389                 if (es.es_lblk <= hole_start)
2390                         return;
2391                 hole_len = min(es.es_lblk - hole_start, hole_len);
2392         }
2393         ext_debug(" -> %u:%u\n", hole_start, hole_len);
2394         ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2395                               EXTENT_STATUS_HOLE);
2396 }
2397
2398 /*
2399  * ext4_ext_rm_idx:
2400  * removes index from the index block.
2401  */
2402 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2403                         struct ext4_ext_path *path, int depth)
2404 {
2405         int err;
2406         ext4_fsblk_t leaf;
2407
2408         /* free index block */
2409         depth--;
2410         path = path + depth;
2411         leaf = ext4_idx_pblock(path->p_idx);
2412         if (unlikely(path->p_hdr->eh_entries == 0)) {
2413                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2414                 return -EFSCORRUPTED;
2415         }
2416         err = ext4_ext_get_access(handle, inode, path);
2417         if (err)
2418                 return err;
2419
2420         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2421                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2422                 len *= sizeof(struct ext4_extent_idx);
2423                 memmove(path->p_idx, path->p_idx + 1, len);
2424         }
2425
2426         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2427         err = ext4_ext_dirty(handle, inode, path);
2428         if (err)
2429                 return err;
2430         ext_debug("index is empty, remove it, free block %llu\n", leaf);
2431         trace_ext4_ext_rm_idx(inode, leaf);
2432
2433         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2434                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2435
2436         while (--depth >= 0) {
2437                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2438                         break;
2439                 path--;
2440                 err = ext4_ext_get_access(handle, inode, path);
2441                 if (err)
2442                         break;
2443                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2444                 err = ext4_ext_dirty(handle, inode, path);
2445                 if (err)
2446                         break;
2447         }
2448         return err;
2449 }
2450
2451 /*
2452  * ext4_ext_calc_credits_for_single_extent:
2453  * This routine returns the max. credits needed to insert an extent
2454  * into the extent tree.
2455  * When passing the actual path, the caller should calculate credits
2456  * under i_data_sem.
2457  */
2458 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2459                                                 struct ext4_ext_path *path)
2460 {
2461         if (path) {
2462                 int depth = ext_depth(inode);
2463                 int ret = 0;
2464
2465                 /* probably there is space in leaf? */
2466                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2467                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2468
2469                         /*
2470                          *  There is some space in the leaf tree, no
2471                          *  need to account for leaf block credit
2472                          *
2473                          *  bitmaps and block group descriptor blocks
2474                          *  and other metadata blocks still need to be
2475                          *  accounted.
2476                          */
2477                         /* 1 bitmap, 1 block group descriptor */
2478                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2479                         return ret;
2480                 }
2481         }
2482
2483         return ext4_chunk_trans_blocks(inode, nrblocks);
2484 }
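/*
 * Descriptive note: when the caller supplies a path and the leaf still has
 * a free slot, the estimate above is just 2 blocks (bitmap + group
 * descriptor) plus EXT4_META_TRANS_BLOCKS() for other metadata; otherwise
 * the generic ext4_chunk_trans_blocks() estimate for @nrblocks is used.
 */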
2485
2486 /*
2487  * How many index/leaf blocks need to change/allocate to add @extents extents?
2488  *
2489  * If we add a single extent, then in the worst case, each tree level
2490  * index/leaf needs to be changed in case of a tree split.
2491  *
2492  * If more extents are inserted, they could cause the whole tree split more
2493  * than once, but this is really rare.
2494  */
2495 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2496 {
2497         int index;
2498         int depth;
2499
2500         /* If we are converting the inline data, only one is needed here. */
2501         if (ext4_has_inline_data(inode))
2502                 return 1;
2503
2504         depth = ext_depth(inode);
2505
2506         if (extents <= 1)
2507                 index = depth * 2;
2508         else
2509                 index = depth * 3;
2510
2511         return index;
2512 }
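/*
 * Illustrative arithmetic (depth value arbitrary): for a tree of depth 2,
 * adding a single extent is charged depth * 2 = 4 index/leaf blocks (each
 * level may be modified and split), while adding several extents is charged
 * depth * 3 = 6 to allow for the rare case of the tree splitting more than
 * once.  Inline-data inodes always get 1.
 */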
2513
2514 static inline int get_default_free_blocks_flags(struct inode *inode)
2515 {
2516         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2517             ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2518                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2519         else if (ext4_should_journal_data(inode))
2520                 return EXT4_FREE_BLOCKS_FORGET;
2521         return 0;
2522 }
2523
2524 /*
2525  * ext4_rereserve_cluster - increment the reserved cluster count when
2526  *                          freeing a cluster with a pending reservation
2527  *
2528  * @inode - file containing the cluster
2529  * @lblk - logical block in cluster to be reserved
2530  *
2531  * Increments the reserved cluster count and adjusts quota in a bigalloc
2532  * file system when freeing a partial cluster containing at least one
2533  * delayed and unwritten block.  A partial cluster meeting that
2534  * requirement will have a pending reservation.  If so, the
2535  * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2536  * defer reserved and allocated space accounting to a subsequent call
2537  * to this function.
2538  */
2539 static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2540 {
2541         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2542         struct ext4_inode_info *ei = EXT4_I(inode);
2543
2544         dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2545
2546         spin_lock(&ei->i_block_reservation_lock);
2547         ei->i_reserved_data_blocks++;
2548         percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2549         spin_unlock(&ei->i_block_reservation_lock);
2550
2551         percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2552         ext4_remove_pending(inode, lblk);
2553 }
2554
2555 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2556                               struct ext4_extent *ex,
2557                               struct partial_cluster *partial,
2558                               ext4_lblk_t from, ext4_lblk_t to)
2559 {
2560         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2561         unsigned short ee_len = ext4_ext_get_actual_len(ex);
2562         ext4_fsblk_t last_pblk, pblk;
2563         ext4_lblk_t num;
2564         int flags;
2565
2566         /* only extent tail removal is allowed */
2567         if (from < le32_to_cpu(ex->ee_block) ||
2568             to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2569                 ext4_error(sbi->s_sb,
2570                            "strange request: removal(2) %u-%u from %u:%u",
2571                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2572                 return 0;
2573         }
2574
2575 #ifdef EXTENTS_STATS
2576         spin_lock(&sbi->s_ext_stats_lock);
2577         sbi->s_ext_blocks += ee_len;
2578         sbi->s_ext_extents++;
2579         if (ee_len < sbi->s_ext_min)
2580                 sbi->s_ext_min = ee_len;
2581         if (ee_len > sbi->s_ext_max)
2582                 sbi->s_ext_max = ee_len;
2583         if (ext_depth(inode) > sbi->s_depth_max)
2584                 sbi->s_depth_max = ext_depth(inode);
2585         spin_unlock(&sbi->s_ext_stats_lock);
2586 #endif
2587
2588         trace_ext4_remove_blocks(inode, ex, from, to, partial);
2589
2590         /*
2591          * if we have a partial cluster, and it's different from the
2592          * cluster of the last block in the extent, we free it
2593          */
2594         last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2595
2596         if (partial->state != initial &&
2597             partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2598                 if (partial->state == tofree) {
2599                         flags = get_default_free_blocks_flags(inode);
2600                         if (ext4_is_pending(inode, partial->lblk))
2601                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2602                         ext4_free_blocks(handle, inode, NULL,
2603                                          EXT4_C2B(sbi, partial->pclu),
2604                                          sbi->s_cluster_ratio, flags);
2605                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2606                                 ext4_rereserve_cluster(inode, partial->lblk);
2607                 }
2608                 partial->state = initial;
2609         }
2610
2611         num = le32_to_cpu(ex->ee_block) + ee_len - from;
2612         pblk = ext4_ext_pblock(ex) + ee_len - num;
2613
2614         /*
2615          * We free the partial cluster at the end of the extent (if any),
2616          * unless the cluster is used by another extent (partial_cluster
2617          * state is nofree).  If a partial cluster exists here, it must be
2618          * shared with the last block in the extent.
2619          */
2620         flags = get_default_free_blocks_flags(inode);
2621
2622         /* partial, left end cluster aligned, right end unaligned */
2623         if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2624             (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2625             (partial->state != nofree)) {
2626                 if (ext4_is_pending(inode, to))
2627                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2628                 ext4_free_blocks(handle, inode, NULL,
2629                                  EXT4_PBLK_CMASK(sbi, last_pblk),
2630                                  sbi->s_cluster_ratio, flags);
2631                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2632                         ext4_rereserve_cluster(inode, to);
2633                 partial->state = initial;
2634                 flags = get_default_free_blocks_flags(inode);
2635         }
2636
2637         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2638
2639         /*
2640          * For bigalloc file systems, we never free a partial cluster
2641          * at the beginning of the extent.  Instead, we check to see if we
2642          * need to free it on a subsequent call to ext4_remove_blocks,
2643          * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2644          */
2645         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2646         ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2647
2648         /* reset the partial cluster if we've freed past it */
2649         if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2650                 partial->state = initial;
2651
2652         /*
2653          * If we've freed the entire extent but the beginning is not left
2654          * cluster aligned and is not marked as ineligible for freeing we
2655          * record the partial cluster at the beginning of the extent.  It
2656          * wasn't freed by the preceding ext4_free_blocks() call, and we
2657          * need to look farther to the left to determine if it's to be freed
2658          * (not shared with another extent). Else, reset the partial
2659          * cluster - we're either done freeing or the beginning of the
2660          * extent is left cluster aligned.
2661          */
2662         if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2663                 if (partial->state == initial) {
2664                         partial->pclu = EXT4_B2C(sbi, pblk);
2665                         partial->lblk = from;
2666                         partial->state = tofree;
2667                 }
2668         } else {
2669                 partial->state = initial;
2670         }
2671
2672         return 0;
2673 }
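/*
 * Descriptive note on the bigalloc handling above: blocks can only be freed
 * in whole clusters, so a cluster that is partially covered by the removed
 * range is not freed immediately.  Instead @partial remembers it (state
 * "tofree") so that a later call, ext4_ext_rm_leaf() or
 * ext4_ext_remove_space(), can decide whether it is shared with a remaining
 * extent (state "nofree") or can finally be released.
 */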
2674
2675 /*
2676  * ext4_ext_rm_leaf() removes the extents associated with the
2677  * blocks appearing between "start" and "end".  Both "start"
2678  * and "end" must appear in the same extent or EFSCORRUPTED is returned.
2679  *
2680  * @handle: The journal handle
2681  * @inode:  The files inode
2682  * @path:   The path to the leaf
2683  * @partial_cluster: The cluster which we'll have to free if all extents
2684  *                   have been released from it.  However, if this value is
2685  *                   negative, it's a cluster just to the right of the
2686  *                   punched region and it must not be freed.
2687  * @start:  The first block to remove
2688  * @end:   The last block to remove
2689  */
2690 static int
2691 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2692                  struct ext4_ext_path *path,
2693                  struct partial_cluster *partial,
2694                  ext4_lblk_t start, ext4_lblk_t end)
2695 {
2696         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2697         int err = 0, correct_index = 0;
2698         int depth = ext_depth(inode), credits, revoke_credits;
2699         struct ext4_extent_header *eh;
2700         ext4_lblk_t a, b;
2701         unsigned num;
2702         ext4_lblk_t ex_ee_block;
2703         unsigned short ex_ee_len;
2704         unsigned unwritten = 0;
2705         struct ext4_extent *ex;
2706         ext4_fsblk_t pblk;
2707
2708         /* the header must be checked already in ext4_ext_remove_space() */
2709         ext_debug("truncate since %u in leaf to %u\n", start, end);
2710         if (!path[depth].p_hdr)
2711                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2712         eh = path[depth].p_hdr;
2713         if (unlikely(path[depth].p_hdr == NULL)) {
2714                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2715                 return -EFSCORRUPTED;
2716         }
2717         /* find where to start removing */
2718         ex = path[depth].p_ext;
2719         if (!ex)
2720                 ex = EXT_LAST_EXTENT(eh);
2721
2722         ex_ee_block = le32_to_cpu(ex->ee_block);
2723         ex_ee_len = ext4_ext_get_actual_len(ex);
2724
2725         trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2726
2727         while (ex >= EXT_FIRST_EXTENT(eh) &&
2728                         ex_ee_block + ex_ee_len > start) {
2729
2730                 if (ext4_ext_is_unwritten(ex))
2731                         unwritten = 1;
2732                 else
2733                         unwritten = 0;
2734
2735                 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2736                           unwritten, ex_ee_len);
2737                 path[depth].p_ext = ex;
2738
2739                 a = ex_ee_block > start ? ex_ee_block : start;
2740                 b = ex_ee_block+ex_ee_len - 1 < end ?
2741                         ex_ee_block+ex_ee_len - 1 : end;
2742
2743                 ext_debug("  border %u:%u\n", a, b);
2744
2745                 /* If this extent is beyond the end of the hole, skip it */
2746                 if (end < ex_ee_block) {
2747                         /*
2748                          * We're going to skip this extent and move to another,
2749                          * so note that its first cluster is in use to avoid
2750                          * freeing it when removing blocks.  Eventually, the
2751                          * right edge of the truncated/punched region will
2752                          * be just to the left.
2753                          */
2754                         if (sbi->s_cluster_ratio > 1) {
2755                                 pblk = ext4_ext_pblock(ex);
2756                                 partial->pclu = EXT4_B2C(sbi, pblk);
2757                                 partial->state = nofree;
2758                         }
2759                         ex--;
2760                         ex_ee_block = le32_to_cpu(ex->ee_block);
2761                         ex_ee_len = ext4_ext_get_actual_len(ex);
2762                         continue;
2763                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2764                         EXT4_ERROR_INODE(inode,
2765                                          "can not handle truncate %u:%u "
2766                                          "on extent %u:%u",
2767                                          start, end, ex_ee_block,
2768                                          ex_ee_block + ex_ee_len - 1);
2769                         err = -EFSCORRUPTED;
2770                         goto out;
2771                 } else if (a != ex_ee_block) {
2772                         /* remove tail of the extent */
2773                         num = a - ex_ee_block;
2774                 } else {
2775                         /* remove whole extent: excellent! */
2776                         num = 0;
2777                 }
2778                 /*
2779                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2780                  * descriptor) for each block group; assume two block
2781                  * groups plus ex_ee_len/blocks_per_block_group for
2782                  * the worst case
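                 * (for example, with the default 4KiB block size and 32768
                 * blocks per group, even a maximal 32768-block extent yields
                 * 7 + 2 * (32768 / 32768) = 9 credits before the quota and
                 * index credits below are added)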
2783                  */
2784                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2785                 if (ex == EXT_FIRST_EXTENT(eh)) {
2786                         correct_index = 1;
2787                         credits += (ext_depth(inode)) + 1;
2788                 }
2789                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2790                 /*
2791                  * We may end up freeing some index blocks and data from the
2792                  * punched range. Note that partial clusters are accounted for
2793                  * by ext4_free_data_revoke_credits().
2794                  */
2795                 revoke_credits =
2796                         ext4_free_metadata_revoke_credits(inode->i_sb,
2797                                                           ext_depth(inode)) +
2798                         ext4_free_data_revoke_credits(inode, b - a + 1);
2799
2800                 err = ext4_datasem_ensure_credits(handle, inode, credits,
2801                                                   credits, revoke_credits);
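                /*
                 * A positive return from ext4_datasem_ensure_credits() means
                 * the handle had to be restarted (dropping i_data_sem in the
                 * process), so the path may be stale; turn that into -EAGAIN
                 * so ext4_ext_remove_space() restarts the scan from "again:".
                 */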
2802                 if (err) {
2803                         if (err > 0)
2804                                 err = -EAGAIN;
2805                         goto out;
2806                 }
2807
2808                 err = ext4_ext_get_access(handle, inode, path + depth);
2809                 if (err)
2810                         goto out;
2811
2812                 err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2813                 if (err)
2814                         goto out;
2815
2816                 if (num == 0)
2817                         /* this extent is removed; mark slot entirely unused */
2818                         ext4_ext_store_pblock(ex, 0);
2819
2820                 ex->ee_len = cpu_to_le16(num);
2821                 /*
2822                  * Do not mark unwritten if all the blocks in the
2823                  * extent have been removed.
2824                  */
2825                 if (unwritten && num)
2826                         ext4_ext_mark_unwritten(ex);
2827                 /*
2828                  * If the extent was completely released,
2829                  * we need to remove it from the leaf
2830                  */
2831                 if (num == 0) {
2832                         if (end != EXT_MAX_BLOCKS - 1) {
2833                                 /*
2834                                  * For hole punching, we need to scoot all the
2835                                  * extents up when an extent is removed so that
2836                                  * we don't have blank extents in the middle
2837                                  */
2838                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2839                                         sizeof(struct ext4_extent));
2840
2841                                 /* Now get rid of the one at the end */
2842                                 memset(EXT_LAST_EXTENT(eh), 0,
2843                                         sizeof(struct ext4_extent));
2844                         }
2845                         le16_add_cpu(&eh->eh_entries, -1);
2846                 }
2847
2848                 err = ext4_ext_dirty(handle, inode, path + depth);
2849                 if (err)
2850                         goto out;
2851
2852                 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2853                                 ext4_ext_pblock(ex));
2854                 ex--;
2855                 ex_ee_block = le32_to_cpu(ex->ee_block);
2856                 ex_ee_len = ext4_ext_get_actual_len(ex);
2857         }
2858
2859         if (correct_index && eh->eh_entries)
2860                 err = ext4_ext_correct_indexes(handle, inode, path);
2861
2862         /*
2863          * If there's a partial cluster and at least one extent remains in
2864          * the leaf, free the partial cluster if it isn't shared with the
2865          * current extent.  If it is shared with the current extent
2866          * we reset the partial cluster because we've reached the start of the
2867          * truncated/punched region and we're done removing blocks.
2868          */
2869         if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2870                 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2871                 if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2872                         int flags = get_default_free_blocks_flags(inode);
2873
2874                         if (ext4_is_pending(inode, partial->lblk))
2875                                 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2876                         ext4_free_blocks(handle, inode, NULL,
2877                                          EXT4_C2B(sbi, partial->pclu),
2878                                          sbi->s_cluster_ratio, flags);
2879                         if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2880                                 ext4_rereserve_cluster(inode, partial->lblk);
2881                 }
2882                 partial->state = initial;
2883         }
2884
2885         /* if this leaf is free, then we should
2886          * remove it from index block above */
2887         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2888                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2889
2890 out:
2891         return err;
2892 }
2893
2894 /*
2895  * ext4_ext_more_to_rm:
2896  * returns 1 if current index has to be freed (even partial)
2897  */
2898 static int
2899 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2900 {
2901         BUG_ON(path->p_idx == NULL);
2902
2903         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2904                 return 0;
2905
2906         /*
2907          * if truncate on deeper level happened, it wasn't partial,
2908          * so we have to consider current index for truncation
2909          */
2910         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2911                 return 0;
2912         return 1;
2913 }
2914
2915 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2916                           ext4_lblk_t end)
2917 {
2918         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2919         int depth = ext_depth(inode);
2920         struct ext4_ext_path *path = NULL;
2921         struct partial_cluster partial;
2922         handle_t *handle;
2923         int i = 0, err = 0;
2924
2925         partial.pclu = 0;
2926         partial.lblk = 0;
2927         partial.state = initial;
2928
2929         ext_debug("truncate since %u to %u\n", start, end);
2930
2931         /* probably the first extent we free will be the last one in its block */
2932         handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2933                         depth + 1,
2934                         ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2935         if (IS_ERR(handle))
2936                 return PTR_ERR(handle);
2937
2938 again:
2939         trace_ext4_ext_remove_space(inode, start, end, depth);
2940
2941         /*
2942          * Check if we are removing extents inside the extent tree. If that
2943          * is the case, we are going to punch a hole inside the extent tree
2944          * so we have to check whether we need to split the extent covering
2945          * the last block to remove so we can easily remove the part of it
2946          * in ext4_ext_rm_leaf().
2947          */
2948         if (end < EXT_MAX_BLOCKS - 1) {
2949                 struct ext4_extent *ex;
2950                 ext4_lblk_t ee_block, ex_end, lblk;
2951                 ext4_fsblk_t pblk;
2952
2953                 /* find extent for or closest extent to this block */
2954                 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2955                 if (IS_ERR(path)) {
2956                         ext4_journal_stop(handle);
2957                         return PTR_ERR(path);
2958                 }
2959                 depth = ext_depth(inode);
2960                 /* The leaf can be missing only if the inode has no blocks at all */
2961                 ex = path[depth].p_ext;
2962                 if (!ex) {
2963                         if (depth) {
2964                                 EXT4_ERROR_INODE(inode,
2965                                                  "path[%d].p_hdr == NULL",
2966                                                  depth);
2967                                 err = -EFSCORRUPTED;
2968                         }
2969                         goto out;
2970                 }
2971
2972                 ee_block = le32_to_cpu(ex->ee_block);
2973                 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2974
2975                 /*
2976                  * See if the last block is inside the extent, if so split
2977                  * the extent at 'end' block so we can easily remove the
2978                  * tail of the first part of the split extent in
2979                  * ext4_ext_rm_leaf().
2980                  */
2981                 if (end >= ee_block && end < ex_end) {
2982
2983                         /*
2984                          * If we're going to split the extent, note that
2985                          * the cluster containing the block after 'end' is
2986                          * in use to avoid freeing it when removing blocks.
2987                          */
2988                         if (sbi->s_cluster_ratio > 1) {
2989                                 pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2990                                 partial.pclu = EXT4_B2C(sbi, pblk);
2991                                 partial.state = nofree;
2992                         }
2993
2994                         /*
2995                          * Split the extent in two so that 'end' is the last
2996                          * block in the first new extent. Also we should not
2997                          * fail removing space due to ENOSPC so try to use
2998                          * reserved block if that happens.
2999                          */
3000                         err = ext4_force_split_extent_at(handle, inode, &path,
3001                                                          end + 1, 1);
3002                         if (err < 0)
3003                                 goto out;
3004
3005                 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
3006                            partial.state == initial) {
3007                         /*
3008                          * If we're punching, there's an extent to the right.
3009                          * If the partial cluster hasn't been set, set it to
3010                          * that extent's first cluster and its state to nofree
3011                          * so it won't be freed should it contain blocks to be
3012                          * removed. If it's already set (tofree/nofree), we're
3013                          * retrying and keep the original partial cluster info
3014                          * so a cluster marked tofree as a result of earlier
3015                          * extent removal is not lost.
3016                          */
3017                         lblk = ex_end + 1;
3018                         err = ext4_ext_search_right(inode, path, &lblk, &pblk,
3019                                                     &ex);
3020                         if (err)
3021                                 goto out;
3022                         if (pblk) {
3023                                 partial.pclu = EXT4_B2C(sbi, pblk);
3024                                 partial.state = nofree;
3025                         }
3026                 }
3027         }
3028         /*
3029          * We start scanning from right side, freeing all the blocks
3030          * after i_size and walking into the tree depth-wise.
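          *
          * (i tracks the level currently being processed: i == depth is a
          *  leaf handled by ext4_ext_rm_leaf(), smaller values are index
          *  blocks whose entries are walked right to left via path[i].p_idx)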
3031          */
3032         depth = ext_depth(inode);
3033         if (path) {
3034                 int k = i = depth;
3035                 while (--k > 0)
3036                         path[k].p_block =
3037                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
3038         } else {
3039                 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
3040                                GFP_NOFS);
3041                 if (path == NULL) {
3042                         ext4_journal_stop(handle);
3043                         return -ENOMEM;
3044                 }
3045                 path[0].p_maxdepth = path[0].p_depth = depth;
3046                 path[0].p_hdr = ext_inode_hdr(inode);
3047                 i = 0;
3048
3049                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
3050                         err = -EFSCORRUPTED;
3051                         goto out;
3052                 }
3053         }
3054         err = 0;
3055
3056         while (i >= 0 && err == 0) {
3057                 if (i == depth) {
3058                         /* this is leaf block */
3059                         err = ext4_ext_rm_leaf(handle, inode, path,
3060                                                &partial, start, end);
3061                         /* root level has p_bh == NULL, brelse() eats this */
3062                         brelse(path[i].p_bh);
3063                         path[i].p_bh = NULL;
3064                         i--;
3065                         continue;
3066                 }
3067
3068                 /* this is index block */
3069                 if (!path[i].p_hdr) {
3070                         ext_debug("initialize header\n");
3071                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
3072                 }
3073
3074                 if (!path[i].p_idx) {
3075                         /* this level hasn't been touched yet */
3076                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
3077                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
3078                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
3079                                   path[i].p_hdr,
3080                                   le16_to_cpu(path[i].p_hdr->eh_entries));
3081                 } else {
3082                         /* we were already here, move on to the next index */
3083                         path[i].p_idx--;
3084                 }
3085
3086                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
3087                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
3088                                 path[i].p_idx);
3089                 if (ext4_ext_more_to_rm(path + i)) {
3090                         struct buffer_head *bh;
3091                         /* go to the next level */
3092                         ext_debug("move to level %d (block %llu)\n",
3093                                   i + 1, ext4_idx_pblock(path[i].p_idx));
3094                         memset(path + i + 1, 0, sizeof(*path));
3095                         bh = read_extent_tree_block(inode,
3096                                 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
3097                                 EXT4_EX_NOCACHE);
3098                         if (IS_ERR(bh)) {
3099                                 /* should we reset i_size? */
3100                                 err = PTR_ERR(bh);
3101                                 break;
3102                         }
3103                         /* Yield here to deal with large extent trees.
3104                          * Should be a no-op if we did IO above. */
3105                         cond_resched();
3106                         if (WARN_ON(i + 1 > depth)) {
3107                                 err = -EFSCORRUPTED;
3108                                 break;
3109                         }
3110                         path[i + 1].p_bh = bh;
3111
3112                         /* save actual number of indexes since this
3113                          * number is changed at the next iteration */
3114                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
3115                         i++;
3116                 } else {
3117                         /* we finished processing this index, go up */
3118                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
3119                                 /* index is empty, remove it;
3120                                  * handle must be already prepared by the
3121                                  * truncatei_leaf() */
3122                                 err = ext4_ext_rm_idx(handle, inode, path, i);
3123                         }
3124                         /* root level has p_bh == NULL, brelse() eats this */
3125                         brelse(path[i].p_bh);
3126                         path[i].p_bh = NULL;
3127                         i--;
3128                         ext_debug("return to level %d\n", i);
3129                 }
3130         }
3131
3132         trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3133                                          path->p_hdr->eh_entries);
3134
3135         /*
3136          * if there's a partial cluster and we have removed the first extent
3137          * in the file, then we also free the partial cluster, if any
3138          */
3139         if (partial.state == tofree && err == 0) {
3140                 int flags = get_default_free_blocks_flags(inode);
3141
3142                 if (ext4_is_pending(inode, partial.lblk))
3143                         flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3144                 ext4_free_blocks(handle, inode, NULL,
3145                                  EXT4_C2B(sbi, partial.pclu),
3146                                  sbi->s_cluster_ratio, flags);
3147                 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3148                         ext4_rereserve_cluster(inode, partial.lblk);
3149                 partial.state = initial;
3150         }
3151
3152         /* TODO: flexible tree reduction should be here */
3153         if (path->p_hdr->eh_entries == 0) {
3154                 /*
3155                  * truncate to zero freed all the tree,
3156                  * so we need to correct eh_depth
3157                  */
3158                 err = ext4_ext_get_access(handle, inode, path);
3159                 if (err == 0) {
3160                         ext_inode_hdr(inode)->eh_depth = 0;
3161                         ext_inode_hdr(inode)->eh_max =
3162                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3163                         err = ext4_ext_dirty(handle, inode, path);
3164                 }
3165         }
3166 out:
3167         ext4_ext_drop_refs(path);
3168         kfree(path);
3169         path = NULL;
3170         if (err == -EAGAIN)
3171                 goto again;
3172         ext4_journal_stop(handle);
3173
3174         return err;
3175 }
3176
3177 /*
3178  * called at mount time
3179  */
3180 void ext4_ext_init(struct super_block *sb)
3181 {
3182         /*
3183          * possible initialization would be here
3184          */
3185
3186         if (ext4_has_feature_extents(sb)) {
3187 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3188                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3189 #ifdef AGGRESSIVE_TEST
3190                        ", aggressive tests"
3191 #endif
3192 #ifdef CHECK_BINSEARCH
3193                        ", check binsearch"
3194 #endif
3195 #ifdef EXTENTS_STATS
3196                        ", stats"
3197 #endif
3198                        "\n");
3199 #endif
3200 #ifdef EXTENTS_STATS
3201                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3202                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3203                 EXT4_SB(sb)->s_ext_max = 0;
3204 #endif
3205         }
3206 }
3207
3208 /*
3209  * called at umount time
3210  */
3211 void ext4_ext_release(struct super_block *sb)
3212 {
3213         if (!ext4_has_feature_extents(sb))
3214                 return;
3215
3216 #ifdef EXTENTS_STATS
3217         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3218                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3219                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3220                         sbi->s_ext_blocks, sbi->s_ext_extents,
3221                         sbi->s_ext_blocks / sbi->s_ext_extents);
3222                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3223                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3224         }
3225 #endif
3226 }
3227
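/*
 * Record the range covered by @ex in the extent status tree as written.
 * This is used after the range has been zeroed out on disk, so that later
 * lookups see it as an initialized mapping.
 */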
3228 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3229 {
3230         ext4_lblk_t  ee_block;
3231         ext4_fsblk_t ee_pblock;
3232         unsigned int ee_len;
3233
3234         ee_block  = le32_to_cpu(ex->ee_block);
3235         ee_len    = ext4_ext_get_actual_len(ex);
3236         ee_pblock = ext4_ext_pblock(ex);
3237
3238         if (ee_len == 0)
3239                 return 0;
3240
3241         return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3242                                      EXTENT_STATUS_WRITTEN);
3243 }
3244
3245 /* FIXME!! we need to try to merge to left or right after zero-out  */
3246 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3247 {
3248         ext4_fsblk_t ee_pblock;
3249         unsigned int ee_len;
3250
3251         ee_len    = ext4_ext_get_actual_len(ex);
3252         ee_pblock = ext4_ext_pblock(ex);
3253         return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3254                                   ee_len);
3255 }
3256
3257 /*
3258  * ext4_split_extent_at() splits an extent at given block.
3259  *
3260  * @handle: the journal handle
3261  * @inode: the file inode
3262  * @path: the path to the extent
3263  * @split: the logical block where the extent is split.
3264  * @split_flag: indicates whether the extent can be zeroed out if the split
3265  *              fails, and the states (initialized or unwritten) of the new extents.
3266  * @flags: flags used to insert new extent to extent tree.
3267  *
3268  *
3269  * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3270  * of which are determined by split_flag.
3271  *
3272  * There are two cases:
3273  *  a> the extent is split into two extents.
3274  *  b> no split is needed, and the extent is just marked.
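 *     (e.g. for case a>, splitting an extent covering blocks 100..199 at
 *      block 150 produces extents 100..149 and 150..199, each half ending
 *      up unwritten or initialized depending on the EXT4_EXT_MARK_UNWRIT1/
 *      EXT4_EXT_MARK_UNWRIT2 bits in @split_flag)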
3275  *
3276  * return 0 on success.
3277  */
3278 static int ext4_split_extent_at(handle_t *handle,
3279                              struct inode *inode,
3280                              struct ext4_ext_path **ppath,
3281                              ext4_lblk_t split,
3282                              int split_flag,
3283                              int flags)
3284 {
3285         struct ext4_ext_path *path = *ppath;
3286         ext4_fsblk_t newblock;
3287         ext4_lblk_t ee_block;
3288         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3289         struct ext4_extent *ex2 = NULL;
3290         unsigned int ee_len, depth;
3291         int err = 0;
3292
3293         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3294                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3295
3296         ext_debug("ext4_split_extent_at: inode %lu, logical "
3297                 "block %llu\n", inode->i_ino, (unsigned long long)split);
3298
3299         ext4_ext_show_leaf(inode, path);
3300
3301         depth = ext_depth(inode);
3302         ex = path[depth].p_ext;
3303         ee_block = le32_to_cpu(ex->ee_block);
3304         ee_len = ext4_ext_get_actual_len(ex);
3305         newblock = split - ee_block + ext4_ext_pblock(ex);
3306
3307         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3308         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3309                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3310                              EXT4_EXT_MARK_UNWRIT1 |
3311                              EXT4_EXT_MARK_UNWRIT2));
3312
3313         err = ext4_ext_get_access(handle, inode, path + depth);
3314         if (err)
3315                 goto out;
3316
3317         if (split == ee_block) {
3318                 /*
3319                  * case b: block @split is the block that the extent begins with
3320                  * then we just change the state of the extent, and splitting
3321                  * is not needed.
3322                  */
3323                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3324                         ext4_ext_mark_unwritten(ex);
3325                 else
3326                         ext4_ext_mark_initialized(ex);
3327
3328                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3329                         ext4_ext_try_to_merge(handle, inode, path, ex);
3330
3331                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3332                 goto out;
3333         }
3334
3335         /* case a */
3336         memcpy(&orig_ex, ex, sizeof(orig_ex));
3337         ex->ee_len = cpu_to_le16(split - ee_block);
3338         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3339                 ext4_ext_mark_unwritten(ex);
3340
3341         /*
3342          * path may lead to new leaf, not to original leaf any more
3343          * after ext4_ext_insert_extent() returns,
3344          */
3345         err = ext4_ext_dirty(handle, inode, path + depth);
3346         if (err)
3347                 goto fix_extent_len;
3348
3349         ex2 = &newex;
3350         ex2->ee_block = cpu_to_le32(split);
3351         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3352         ext4_ext_store_pblock(ex2, newblock);
3353         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3354                 ext4_ext_mark_unwritten(ex2);
3355
3356         err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
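        /*
         * Inserting the second half may fail with ENOSPC.  If the caller set
         * EXT4_EXT_MAY_ZEROOUT, fall back to zeroing out the half that carries
         * no valid data (or the whole original extent when neither half does)
         * and keep the range as a single initialized extent instead of failing.
         */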
3357         if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3358                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3359                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3360                                 err = ext4_ext_zeroout(inode, ex2);
3361                                 zero_ex.ee_block = ex2->ee_block;
3362                                 zero_ex.ee_len = cpu_to_le16(
3363                                                 ext4_ext_get_actual_len(ex2));
3364                                 ext4_ext_store_pblock(&zero_ex,
3365                                                       ext4_ext_pblock(ex2));
3366                         } else {
3367                                 err = ext4_ext_zeroout(inode, ex);
3368                                 zero_ex.ee_block = ex->ee_block;
3369                                 zero_ex.ee_len = cpu_to_le16(
3370                                                 ext4_ext_get_actual_len(ex));
3371                                 ext4_ext_store_pblock(&zero_ex,
3372                                                       ext4_ext_pblock(ex));
3373                         }
3374                 } else {
3375                         err = ext4_ext_zeroout(inode, &orig_ex);
3376                         zero_ex.ee_block = orig_ex.ee_block;
3377                         zero_ex.ee_len = cpu_to_le16(
3378                                                 ext4_ext_get_actual_len(&orig_ex));
3379                         ext4_ext_store_pblock(&zero_ex,
3380                                               ext4_ext_pblock(&orig_ex));
3381                 }
3382
3383                 if (err)
3384                         goto fix_extent_len;
3385                 /* update the extent length and mark as initialized */
3386                 ex->ee_len = cpu_to_le16(ee_len);
3387                 ext4_ext_try_to_merge(handle, inode, path, ex);
3388                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3389                 if (err)
3390                         goto fix_extent_len;
3391
3392                 /* update extent status tree */
3393                 err = ext4_zeroout_es(inode, &zero_ex);
3394
3395                 goto out;
3396         } else if (err)
3397                 goto fix_extent_len;
3398
3399 out:
3400         ext4_ext_show_leaf(inode, path);
3401         return err;
3402
3403 fix_extent_len:
3404         ex->ee_len = orig_ex.ee_len;
3405         ext4_ext_dirty(handle, inode, path + path->p_depth);
3406         return err;
3407 }
3408
3409 /*
3410  * ext4_split_extent() splits an extent and marks the extent which is covered
3411  * by @map as @split_flag indicates
3412  *
3413  * It may result in splitting the extent into multiple extents (up to three).
3414  * There are three possibilities:
3415  *   a> There is no split required
3416  *   b> Splits in two extents: Split is happening at either end of the extent
3417  *   c> Splits in three extents: Someone is splitting in the middle of the extent
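 *      (for example, if @map covers blocks 120..139 of an extent spanning
 *       100..199, the result is three extents: 100..119, 120..139 and
 *       140..199)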
3418  *
3419  */
3420 static int ext4_split_extent(handle_t *handle,
3421                               struct inode *inode,
3422                               struct ext4_ext_path **ppath,
3423                               struct ext4_map_blocks *map,
3424                               int split_flag,
3425                               int flags)
3426 {
3427         struct ext4_ext_path *path = *ppath;
3428         ext4_lblk_t ee_block;
3429         struct ext4_extent *ex;
3430         unsigned int ee_len, depth;
3431         int err = 0;
3432         int unwritten;
3433         int split_flag1, flags1;
3434         int allocated = map->m_len;
3435
3436         depth = ext_depth(inode);
3437         ex = path[depth].p_ext;
3438         ee_block = le32_to_cpu(ex->ee_block);
3439         ee_len = ext4_ext_get_actual_len(ex);
3440         unwritten = ext4_ext_is_unwritten(ex);
3441
3442         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3443                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3444                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3445                 if (unwritten)
3446                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3447                                        EXT4_EXT_MARK_UNWRIT2;
3448                 if (split_flag & EXT4_EXT_DATA_VALID2)
3449                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3450                 err = ext4_split_extent_at(handle, inode, ppath,
3451                                 map->m_lblk + map->m_len, split_flag1, flags1);
3452                 if (err)
3453                         goto out;
3454         } else {
3455                 allocated = ee_len - (map->m_lblk - ee_block);
3456         }
3457         /*
3458          * Update path is required because previous ext4_split_extent_at() may
3459          * result in split of original leaf or extent zeroout.
3460          */
3461         path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3462         if (IS_ERR(path))
3463                 return PTR_ERR(path);
3464         depth = ext_depth(inode);
3465         ex = path[depth].p_ext;
3466         if (!ex) {
3467                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3468                                  (unsigned long) map->m_lblk);
3469                 return -EFSCORRUPTED;
3470         }
3471         unwritten = ext4_ext_is_unwritten(ex);
3472         split_flag1 = 0;
3473
3474         if (map->m_lblk >= ee_block) {
3475                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3476                 if (unwritten) {
3477                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3478                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3479                                                      EXT4_EXT_MARK_UNWRIT2);
3480                 }
3481                 err = ext4_split_extent_at(handle, inode, ppath,
3482                                 map->m_lblk, split_flag1, flags);
3483                 if (err)
3484                         goto out;
3485         }
3486
3487         ext4_ext_show_leaf(inode, path);
3488 out:
3489         return err ? err : allocated;
3490 }
3491
3492 /*
3493  * This function is called by ext4_ext_map_blocks() if someone tries to write
3494  * to an unwritten extent. It may result in splitting the unwritten
3495  * extent into multiple extents (up to three - one initialized and two
3496  * unwritten).
3497  * There are three possibilities:
3498  *   a> There is no split required: Entire extent should be initialized
3499  *   b> Splits in two extents: Write is happening at either end of the extent
3500  *   c> Splits in three extents: Someone is writing in the middle of the extent
3501  *
3502  * Pre-conditions:
3503  *  - The extent pointed to by 'path' is unwritten.
3504  *  - The extent pointed to by 'path' contains a superset
3505  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3506  *
3507  * Post-conditions on success:
3508  *  - the returned value is the number of blocks beyond map->m_lblk
3509  *    that are allocated and initialized.
3510  *    It is guaranteed to be >= map->m_len.
3511  */
3512 static int ext4_ext_convert_to_initialized(handle_t *handle,
3513                                            struct inode *inode,
3514                                            struct ext4_map_blocks *map,
3515                                            struct ext4_ext_path **ppath,
3516                                            int flags)
3517 {
3518         struct ext4_ext_path *path = *ppath;
3519         struct ext4_sb_info *sbi;
3520         struct ext4_extent_header *eh;
3521         struct ext4_map_blocks split_map;
3522         struct ext4_extent zero_ex1, zero_ex2;
3523         struct ext4_extent *ex, *abut_ex;
3524         ext4_lblk_t ee_block, eof_block;
3525         unsigned int ee_len, depth, map_len = map->m_len;
3526         int allocated = 0, max_zeroout = 0;
3527         int err = 0;
3528         int split_flag = EXT4_EXT_DATA_VALID2;
3529
3530         ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3531                 "block %llu, max_blocks %u\n", inode->i_ino,
3532                 (unsigned long long)map->m_lblk, map_len);
3533
3534         sbi = EXT4_SB(inode->i_sb);
3535         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3536                 inode->i_sb->s_blocksize_bits;
3537         if (eof_block < map->m_lblk + map_len)
3538                 eof_block = map->m_lblk + map_len;
3539
3540         depth = ext_depth(inode);
3541         eh = path[depth].p_hdr;
3542         ex = path[depth].p_ext;
3543         ee_block = le32_to_cpu(ex->ee_block);
3544         ee_len = ext4_ext_get_actual_len(ex);
3545         zero_ex1.ee_len = 0;
3546         zero_ex2.ee_len = 0;
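        /*
         * zero_ex1 and zero_ex2 record any ranges zeroed out below so the
         * extent status tree can be updated at "out:" once the conversion
         * succeeds.
         */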
3547
3548         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3549
3550         /* Pre-conditions */
3551         BUG_ON(!ext4_ext_is_unwritten(ex));
3552         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3553
3554         /*
3555          * Attempt to transfer newly initialized blocks from the currently
3556          * unwritten extent to its neighbor. This is much cheaper
3557          * than an insertion followed by a merge as those involve costly
3558          * memmove() calls. Transferring to the left is the common case in
3559          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3560          * followed by append writes.
3561          *
3562          * Limitations of the current logic:
3563          *  - L1: we do not deal with writes covering the whole extent.
3564          *    This would require removing the extent if the transfer
3565          *    is possible.
3566          *  - L2: we only attempt to merge with an extent stored in the
3567          *    same extent tree node.
3568          */
3569         if ((map->m_lblk == ee_block) &&
3570                 /* See if we can merge left */
3571                 (map_len < ee_len) &&           /*L1*/
3572                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3573                 ext4_lblk_t prev_lblk;
3574                 ext4_fsblk_t prev_pblk, ee_pblk;
3575                 unsigned int prev_len;
3576
3577                 abut_ex = ex - 1;
3578                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3579                 prev_len = ext4_ext_get_actual_len(abut_ex);
3580                 prev_pblk = ext4_ext_pblock(abut_ex);
3581                 ee_pblk = ext4_ext_pblock(ex);
3582
3583                 /*
3584                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3585                  * upon those conditions:
3586                  * - C1: abut_ex is initialized,
3587                  * - C2: abut_ex is logically abutting ex,
3588                  * - C3: abut_ex is physically abutting ex,
3589                  * - C4: abut_ex can receive the additional blocks without
3590                  *   overflowing the (initialized) length limit.
3591                  */
3592                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3593                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3594                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3595                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3596                         err = ext4_ext_get_access(handle, inode, path + depth);
3597                         if (err)
3598                                 goto out;
3599
3600                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3601                                 map, ex, abut_ex);
3602
3603                         /* Shift the start of ex by 'map_len' blocks */
3604                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3605                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3606                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3607                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3608
3609                         /* Extend abut_ex by 'map_len' blocks */
3610                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3611
3612                         /* Result: number of initialized blocks past m_lblk */
3613                         allocated = map_len;
3614                 }
3615         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3616                    (map_len < ee_len) &&        /*L1*/
3617                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3618                 /* See if we can merge right */
3619                 ext4_lblk_t next_lblk;
3620                 ext4_fsblk_t next_pblk, ee_pblk;
3621                 unsigned int next_len;
3622
3623                 abut_ex = ex + 1;
3624                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3625                 next_len = ext4_ext_get_actual_len(abut_ex);
3626                 next_pblk = ext4_ext_pblock(abut_ex);
3627                 ee_pblk = ext4_ext_pblock(ex);
3628
3629                 /*
3630                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3631                  * upon those conditions:
3632                  * - C1: abut_ex is initialized,
3633                  * - C2: abut_ex is logically abutting ex,
3634                  * - C3: abut_ex is physically abutting ex,
3635                  * - C4: abut_ex can receive the additional blocks without
3636                  *   overflowing the (initialized) length limit.
3637                  */
3638                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3639                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3640                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3641                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3642                         err = ext4_ext_get_access(handle, inode, path + depth);
3643                         if (err)
3644                                 goto out;
3645
3646                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3647                                 map, ex, abut_ex);
3648
3649                         /* Shift the start of abut_ex by 'map_len' blocks */
3650                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3651                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3652                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3653                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3654
3655                         /* Extend abut_ex by 'map_len' blocks */
3656                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3657
3658                         /* Result: number of initialized blocks past m_lblk */
3659                         allocated = map_len;
3660                 }
3661         }
3662         if (allocated) {
3663                 /* Mark the block containing both extents as dirty */
3664                 ext4_ext_dirty(handle, inode, path + depth);
3665
3666                 /* Update path to point to the right extent */
3667                 path[depth].p_ext = abut_ex;
3668                 goto out;
3669         } else
3670                 allocated = ee_len - (map->m_lblk - ee_block);
3671
3672         WARN_ON(map->m_lblk < ee_block);
3673         /*
3674          * It is safe to convert extent to initialized via explicit
3675          * zeroout only if extent is fully inside i_size or new_size.
3676          */
3677         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3678
3679         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3680                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3681                         (inode->i_sb->s_blocksize_bits - 10);
3682
3683         /*
3684          * five cases:
3685          * 1. split the extent into three extents.
3686          * 2. split the extent into two extents, zeroout the head of the first
3687          *    extent.
3688          * 3. split the extent into two extents, zeroout the tail of the second
3689          *    extent.
3690  * 4. split the extent into two extents without zeroing out.
3691          * 5. no splitting needed, just possibly zeroout the head and / or the
3692          *    tail of the extent.
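          *
          * The choice between zeroing out and splitting is bounded by
          * s_extent_max_zeroout_kb (converted to blocks in max_zeroout above);
          * zeroing out costs writing zeroes but avoids growing the extent tree.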
3693          */
3694         split_map.m_lblk = map->m_lblk;
3695         split_map.m_len = map->m_len;
3696
3697         if (max_zeroout && (allocated > split_map.m_len)) {
3698                 if (allocated <= max_zeroout) {
3699                         /* case 3 or 5 */
3700                         zero_ex1.ee_block =
3701                                  cpu_to_le32(split_map.m_lblk +
3702                                              split_map.m_len);
3703                         zero_ex1.ee_len =
3704                                 cpu_to_le16(allocated - split_map.m_len);
3705                         ext4_ext_store_pblock(&zero_ex1,
3706                                 ext4_ext_pblock(ex) + split_map.m_lblk +
3707                                 split_map.m_len - ee_block);
3708                         err = ext4_ext_zeroout(inode, &zero_ex1);
3709                         if (err)
3710                                 goto out;
3711                         split_map.m_len = allocated;
3712                 }
3713                 if (split_map.m_lblk - ee_block + split_map.m_len <
3714                                                                 max_zeroout) {
3715                         /* case 2 or 5 */
3716                         if (split_map.m_lblk != ee_block) {
3717                                 zero_ex2.ee_block = ex->ee_block;
3718                                 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3719                                                         ee_block);
3720                                 ext4_ext_store_pblock(&zero_ex2,
3721                                                       ext4_ext_pblock(ex));
3722                                 err = ext4_ext_zeroout(inode, &zero_ex2);
3723                                 if (err)
3724                                         goto out;
3725                         }
3726
3727                         split_map.m_len += split_map.m_lblk - ee_block;
3728                         split_map.m_lblk = ee_block;
3729                         allocated = map->m_len;
3730                 }
3731         }
3732
3733         err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3734                                 flags);
3735         if (err > 0)
3736                 err = 0;
3737 out:
3738         /* If we have gotten a failure, don't zero out status tree */
3739         if (!err) {
3740                 err = ext4_zeroout_es(inode, &zero_ex1);
3741                 if (!err)
3742                         err = ext4_zeroout_es(inode, &zero_ex2);
3743         }
3744         return err ? err : allocated;
3745 }
3746
3747 /*
3748  * This function is called by ext4_ext_map_blocks() from
3749  * ext4_get_blocks_dio_write() when DIO to write
3750  * to an unwritten extent.
3751  *
3752  * Writing to an unwritten extent may result in splitting the unwritten
3753  * extent into multiple initialized/unwritten extents (up to three)
3754  * There are three possibilities:
3755  *   a> There is no split required: Entire extent should be unwritten
3756  *   b> Splits in two extents: Write is happening at either end of the extent
3757  *   c> Splits in three extents: Someone is writing in the middle of the extent
3758  *
3759  * This works the same way in the case of initialized -> unwritten conversion.
3760  *
3761  * One or more index blocks may be needed if the extent tree grows after
3762  * the unwritten extent is split. To prevent ENOSPC from occurring at I/O
3763  * completion, we need to split the unwritten extent before the DIO is
3764  * submitted. The unwritten extent handled here will be split into at most
3765  * three unwritten extents. After the I/O completes, the part that was
3766  * written will be converted to initialized by the end_io callback via
3767  * ext4_convert_unwritten_extents().
3768  *
3769  * Returns the size of unwritten extent to be written on success.
3770  */
3771 static int ext4_split_convert_extents(handle_t *handle,
3772                                         struct inode *inode,
3773                                         struct ext4_map_blocks *map,
3774                                         struct ext4_ext_path **ppath,
3775                                         int flags)
3776 {
3777         struct ext4_ext_path *path = *ppath;
3778         ext4_lblk_t eof_block;
3779         ext4_lblk_t ee_block;
3780         struct ext4_extent *ex;
3781         unsigned int ee_len;
3782         int split_flag = 0, depth;
3783
3784         ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3785                   __func__, inode->i_ino,
3786                   (unsigned long long)map->m_lblk, map->m_len);
3787
3788         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3789                 inode->i_sb->s_blocksize_bits;
3790         if (eof_block < map->m_lblk + map->m_len)
3791                 eof_block = map->m_lblk + map->m_len;
3792         /*
3793          * It is safe to convert extent to initialized via explicit
3794          * zeroout only if extent is fully inside i_size or new_size.
3795          */
3796         depth = ext_depth(inode);
3797         ex = path[depth].p_ext;
3798         ee_block = le32_to_cpu(ex->ee_block);
3799         ee_len = ext4_ext_get_actual_len(ex);
3800
3801         /* Convert to unwritten */
3802         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3803                 split_flag |= EXT4_EXT_DATA_VALID1;
3804         /* Convert to initialized */
3805         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3806                 split_flag |= ee_block + ee_len <= eof_block ?
3807                               EXT4_EXT_MAY_ZEROOUT : 0;
3808                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3809         }
3810         flags |= EXT4_GET_BLOCKS_PRE_IO;
3811         return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3812 }
3813
3814 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3815                                                 struct inode *inode,
3816                                                 struct ext4_map_blocks *map,
3817                                                 struct ext4_ext_path **ppath)
3818 {
3819         struct ext4_ext_path *path = *ppath;
3820         struct ext4_extent *ex;
3821         ext4_lblk_t ee_block;
3822         unsigned int ee_len;
3823         int depth;
3824         int err = 0;
3825
3826         depth = ext_depth(inode);
3827         ex = path[depth].p_ext;
3828         ee_block = le32_to_cpu(ex->ee_block);
3829         ee_len = ext4_ext_get_actual_len(ex);
3830
3831         ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3832                 "block %llu, max_blocks %u\n", inode->i_ino,
3833                   (unsigned long long)ee_block, ee_len);
3834
3835         /* If the extent is larger than requested, it is a clear sign that we
3836          * still have some extent state machine issues left; an extent split is
3837          * still required.
3838          * TODO: once all related issues are fixed, this situation should be
3839          * illegal.
3840          */
3841         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3842 #ifdef CONFIG_EXT4_DEBUG
3843                 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3844                              " len %u; IO logical block %llu, len %u",
3845                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3846                              (unsigned long long)map->m_lblk, map->m_len);
3847 #endif
3848                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3849                                                  EXT4_GET_BLOCKS_CONVERT);
3850                 if (err < 0)
3851                         return err;
3852                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3853                 if (IS_ERR(path))
3854                         return PTR_ERR(path);
3855                 depth = ext_depth(inode);
3856                 ex = path[depth].p_ext;
3857         }
3858
3859         err = ext4_ext_get_access(handle, inode, path + depth);
3860         if (err)
3861                 goto out;
3862         /* first mark the extent as initialized */
3863         ext4_ext_mark_initialized(ex);
3864
3865         /* note: ext4_ext_correct_indexes() isn't needed here because
3866          * borders are not changed
3867          */
3868         ext4_ext_try_to_merge(handle, inode, path, ex);
3869
3870         /* Mark modified extent as dirty */
3871         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3872 out:
3873         ext4_ext_show_leaf(inode, path);
3874         return err;
3875 }
3876
3877 /*
3878  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3879  */
3880 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3881                               ext4_lblk_t lblk,
3882                               struct ext4_ext_path *path,
3883                               unsigned int len)
3884 {
3885         int i, depth;
3886         struct ext4_extent_header *eh;
3887         struct ext4_extent *last_ex;
3888
3889         if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3890                 return 0;
3891
3892         depth = ext_depth(inode);
3893         eh = path[depth].p_hdr;
3894
3895         /*
3896          * We're going to remove EOFBLOCKS_FL entirely in future so we
3897          * do not care for this case anymore. Simply remove the flag
3898          * if there are no extents.
3899          */
3900         if (unlikely(!eh->eh_entries))
3901                 goto out;
3902         last_ex = EXT_LAST_EXTENT(eh);
3903         /*
3904          * We should clear the EOFBLOCKS_FL flag if we are writing the
3905          * last block in the last extent in the file.  We test this by
3906          * first checking to see if the caller to
3907          * ext4_ext_get_blocks() was interested in the last block (or
3908          * a block beyond the last block) in the current extent.  If
3909          * this turns out to be false, we can bail out from this
3910          * function immediately.
3911          */
3912         if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3913             ext4_ext_get_actual_len(last_ex))
3914                 return 0;
3915         /*
3916          * If the caller does appear to be planning to write at or
3917          * beyond the end of the current extent, we then test to see
3918          * if the current extent is the last extent in the file, by
3919          * checking to make sure it was reached via the rightmost node
3920          * at each level of the tree.
3921          */
3922         for (i = depth-1; i >= 0; i--)
3923                 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3924                         return 0;
3925 out:
3926         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3927         return ext4_mark_inode_dirty(handle, inode);
3928 }
3929
3930 static int
3931 convert_initialized_extent(handle_t *handle, struct inode *inode,
3932                            struct ext4_map_blocks *map,
3933                            struct ext4_ext_path **ppath,
3934                            unsigned int allocated)
3935 {
3936         struct ext4_ext_path *path = *ppath;
3937         struct ext4_extent *ex;
3938         ext4_lblk_t ee_block;
3939         unsigned int ee_len;
3940         int depth;
3941         int err = 0;
3942
3943         /*
3944          * Make sure that the extent is no bigger than we support with
3945          * an unwritten extent
3946          */
3947         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3948                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3949
3950         depth = ext_depth(inode);
3951         ex = path[depth].p_ext;
3952         ee_block = le32_to_cpu(ex->ee_block);
3953         ee_len = ext4_ext_get_actual_len(ex);
3954
3955         ext_debug("%s: inode %lu, logical "
3956                 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3957                   (unsigned long long)ee_block, ee_len);
3958
3959         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3960                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3961                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3962                 if (err < 0)
3963                         return err;
3964                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3965                 if (IS_ERR(path))
3966                         return PTR_ERR(path);
3967                 depth = ext_depth(inode);
3968                 ex = path[depth].p_ext;
3969                 if (!ex) {
3970                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3971                                          (unsigned long) map->m_lblk);
3972                         return -EFSCORRUPTED;
3973                 }
3974         }
3975
3976         err = ext4_ext_get_access(handle, inode, path + depth);
3977         if (err)
3978                 return err;
3979         /* first mark the extent as unwritten */
3980         ext4_ext_mark_unwritten(ex);
3981
3982         /* note: ext4_ext_correct_indexes() isn't needed here because
3983          * borders are not changed
3984          */
3985         ext4_ext_try_to_merge(handle, inode, path, ex);
3986
3987         /* Mark modified extent as dirty */
3988         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3989         if (err)
3990                 return err;
3991         ext4_ext_show_leaf(inode, path);
3992
3993         ext4_update_inode_fsync_trans(handle, inode, 1);
3994         err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
3995         if (err)
3996                 return err;
3997         map->m_flags |= EXT4_MAP_UNWRITTEN;
3998         if (allocated > map->m_len)
3999                 allocated = map->m_len;
4000         map->m_len = allocated;
4001         return allocated;
4002 }
4003
4004 static int
4005 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4006                         struct ext4_map_blocks *map,
4007                         struct ext4_ext_path **ppath, int flags,
4008                         unsigned int allocated, ext4_fsblk_t newblock)
4009 {
4010         struct ext4_ext_path *path = *ppath;
4011         int ret = 0;
4012         int err = 0;
4013
4014         ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4015                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
4016                   inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4017                   flags, allocated);
4018         ext4_ext_show_leaf(inode, path);
4019
4020         /*
4021          * When writing into unwritten space, we should not fail to
4022          * allocate metadata blocks for the new extent block if needed.
4023          */
4024         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4025
4026         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4027                                                     allocated, newblock);
4028
4029         /* get_block() before submit the IO, split the extent */
4030         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4031                 ret = ext4_split_convert_extents(handle, inode, map, ppath,
4032                                          flags | EXT4_GET_BLOCKS_CONVERT);
4033                 if (ret <= 0)
4034                         goto out;
4035                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4036                 goto out;
4037         }
4038         /* IO end_io complete, convert the filled extent to written */
4039         if (flags & EXT4_GET_BLOCKS_CONVERT) {
4040                 if (flags & EXT4_GET_BLOCKS_ZERO) {
4041                         if (allocated > map->m_len)
4042                                 allocated = map->m_len;
4043                         err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
4044                                                  allocated);
4045                         if (err < 0)
4046                                 goto out2;
4047                 }
4048                 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4049                                                            ppath);
4050                 if (ret >= 0) {
4051                         ext4_update_inode_fsync_trans(handle, inode, 1);
4052                         err = check_eofblocks_fl(handle, inode, map->m_lblk,
4053                                                  path, map->m_len);
4054                 } else
4055                         err = ret;
4056                 map->m_flags |= EXT4_MAP_MAPPED;
4057                 map->m_pblk = newblock;
4058                 if (allocated > map->m_len)
4059                         allocated = map->m_len;
4060                 map->m_len = allocated;
4061                 goto out2;
4062         }
4063         /* buffered IO case */
4064         /*
4065          * a repeated fallocate creation request:
4066          * we already have an unwritten extent
4067          */
4068         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4069                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4070                 goto map_out;
4071         }
4072
4073         /* buffered READ or buffered write_begin() lookup */
4074         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4075                 /*
4076                  * We have blocks reserved already.  We
4077                  * return allocated blocks so that delalloc
4078                  * won't do block reservation for us.  But
4079                  * the buffer head will be unmapped so that
4080                  * a read from the block returns 0s.
4081                  */
4082                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4083                 goto out1;
4084         }
4085
4086         /* buffered write, writepage time, convert */
4087         ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4088         if (ret >= 0)
4089                 ext4_update_inode_fsync_trans(handle, inode, 1);
4090 out:
4091         if (ret <= 0) {
4092                 err = ret;
4093                 goto out2;
4094         } else
4095                 allocated = ret;
4096         map->m_flags |= EXT4_MAP_NEW;
4097         if (allocated > map->m_len)
4098                 allocated = map->m_len;
4099         map->m_len = allocated;
4100
4101 map_out:
4102         map->m_flags |= EXT4_MAP_MAPPED;
4103         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4104                 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4105                                          map->m_len);
4106                 if (err < 0)
4107                         goto out2;
4108         }
4109 out1:
4110         if (allocated > map->m_len)
4111                 allocated = map->m_len;
4112         ext4_ext_show_leaf(inode, path);
4113         map->m_pblk = newblock;
4114         map->m_len = allocated;
4115 out2:
4116         return err ? err : allocated;
4117 }
4118
4119 /*
4120  * get_implied_cluster_alloc - check to see if the requested
4121  * allocation (in the map structure) overlaps with a cluster already
4122  * allocated in an extent.
4123  *      @sb     The filesystem superblock structure
4124  *      @map    The requested lblk->pblk mapping
4125  *      @ex     The extent structure which might contain an implied
4126  *                      cluster allocation
4127  *
4128  * This function is called by ext4_ext_map_blocks() after we failed to
4129  * find blocks that were already in the inode's extent tree.  Hence,
4130  * we know that the beginning of the requested region cannot overlap
4131  * the extent from the inode's extent tree.  There are three cases we
4132  * want to catch.  The first is this case:
4133  *
4134  *               |--- cluster # N--|
4135  *    |--- extent ---|  |---- requested region ---|
4136  *                      |==========|
4137  *
4138  * The second case that we need to test for is this one:
4139  *
4140  *   |--------- cluster # N ----------------|
4141  *         |--- requested region --|   |------- extent ----|
4142  *         |=======================|
4143  *
4144  * The third case is when the requested region lies between two extents
4145  * within the same cluster:
4146  *          |------------- cluster # N-------------|
4147  * |----- ex -----|                  |---- ex_right ----|
4148  *                  |------ requested region ------|
4149  *                  |================|
4150  *
4151  * In each of the above cases, we need to set the map->m_pblk and
4152  * map->m_len so they correspond to the extent labelled as
4153  * "|====|" from cluster #N, since it is already in use for data in
4154  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4155  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4156  * as a new "allocated" block region.  Otherwise, we will return 0 and
4157  * ext4_ext_map_blocks() will then allocate one or more new clusters
4158  * by calling ext4_mb_new_blocks().
4159  */
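/*
 * Illustrative example (an editorial sketch, assuming a bigalloc cluster
 * ratio of 4): an extent covering logical blocks 4..6 at physical blocks
 * 100..102 and a request for logical block 7 fall in the same cluster, so
 * the first case above applies: m_pblk becomes 103 (the same in-cluster
 * offset) and m_len is capped at the single remaining block in the cluster.
 */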
4160 static int get_implied_cluster_alloc(struct super_block *sb,
4161                                      struct ext4_map_blocks *map,
4162                                      struct ext4_extent *ex,
4163                                      struct ext4_ext_path *path)
4164 {
4165         struct ext4_sb_info *sbi = EXT4_SB(sb);
4166         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4167         ext4_lblk_t ex_cluster_start, ex_cluster_end;
4168         ext4_lblk_t rr_cluster_start;
4169         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4170         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171         unsigned short ee_len = ext4_ext_get_actual_len(ex);
4172
4173         /* The extent passed in that we are trying to match */
4174         ex_cluster_start = EXT4_B2C(sbi, ee_block);
4175         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4176
4177         /* The requested region passed into ext4_map_blocks() */
4178         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4179
4180         if ((rr_cluster_start == ex_cluster_end) ||
4181             (rr_cluster_start == ex_cluster_start)) {
4182                 if (rr_cluster_start == ex_cluster_end)
4183                         ee_start += ee_len - 1;
4184                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4185                 map->m_len = min(map->m_len,
4186                                  (unsigned) sbi->s_cluster_ratio - c_offset);
4187                 /*
4188                  * Check for and handle this case:
4189                  *
4190                  *   |--------- cluster # N-------------|
4191                  *                     |------- extent ----|
4192                  *         |--- requested region ---|
4193                  *         |===========|
4194                  */
4195
4196                 if (map->m_lblk < ee_block)
4197                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
4198
4199                 /*
4200                  * Check for the case where there is already another allocated
4201                  * block to the right of 'ex' but before the end of the cluster.
4202                  *
4203                  *          |------------- cluster # N-------------|
4204                  * |----- ex -----|                  |---- ex_right ----|
4205                  *                  |------ requested region ------|
4206                  *                  |================|
4207                  */
4208                 if (map->m_lblk > ee_block) {
4209                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4210                         map->m_len = min(map->m_len, next - map->m_lblk);
4211                 }
4212
4213                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4214                 return 1;
4215         }
4216
4217         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4218         return 0;
4219 }
4220
4221
4222 /*
4223  * Block allocation/map/preallocation routine for extents based files
4224  *
4225  *
4226  * Need to be called with
4227  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4228  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4229  *
4230  * return > 0, number of blocks already mapped/allocated
4231  *          if create == 0 and these are pre-allocated blocks
4232  *              buffer head is unmapped
4233  *          otherwise blocks are mapped
4234  *
4235  * return = 0, if plain look up failed (blocks have not been allocated)
4236  *          buffer head is unmapped
4237  *
4238  * return < 0, error case.
4239  */
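/*
 * Illustrative usage note (a sketch, not part of the original comment):
 * a caller such as ext4_map_blocks() fills in map->m_lblk and map->m_len,
 * takes i_data_sem as described above, and on a positive return reads the
 * resulting mapping from map->m_pblk, map->m_len and the EXT4_MAP_* flags.
 */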
4240 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4241                         struct ext4_map_blocks *map, int flags)
4242 {
4243         struct ext4_ext_path *path = NULL;
4244         struct ext4_extent newex, *ex, *ex2;
4245         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4246         ext4_fsblk_t newblock = 0;
4247         int free_on_err = 0, err = 0, depth, ret;
4248         unsigned int allocated = 0, offset = 0;
4249         unsigned int allocated_clusters = 0;
4250         struct ext4_allocation_request ar;
4251         ext4_lblk_t cluster_offset;
4252         bool map_from_cluster = false;
4253
4254         ext_debug("blocks %u/%u requested for inode %lu\n",
4255                   map->m_lblk, map->m_len, inode->i_ino);
4256         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4257
4258         /* find extent for this block */
4259         path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4260         if (IS_ERR(path)) {
4261                 err = PTR_ERR(path);
4262                 path = NULL;
4263                 goto out2;
4264         }
4265
4266         depth = ext_depth(inode);
4267
4268         /*
4269          * a consistent leaf must not be empty;
4270          * this situation is possible, though, _during_ tree modification;
4271          * this is why an assert can't be put in ext4_find_extent()
4272          */
4273         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4274                 EXT4_ERROR_INODE(inode, "bad extent address "
4275                                  "lblock: %lu, depth: %d pblock %lld",
4276                                  (unsigned long) map->m_lblk, depth,
4277                                  path[depth].p_block);
4278                 err = -EFSCORRUPTED;
4279                 goto out2;
4280         }
4281
4282         ex = path[depth].p_ext;
4283         if (ex) {
4284                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4285                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4286                 unsigned short ee_len;
4287
4288
4289                 /*
4290                  * unwritten extents are treated as holes, except that
4291                  * we split out initialized portions during a write.
4292                  */
4293                 ee_len = ext4_ext_get_actual_len(ex);
4294
4295                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4296
4297                 /* if found extent covers block, simply return it */
4298                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4299                         newblock = map->m_lblk - ee_block + ee_start;
4300                         /* number of remaining blocks in the extent */
4301                         allocated = ee_len - (map->m_lblk - ee_block);
4302                         ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4303                                   ee_block, ee_len, newblock);
4304
4305                         /*
4306                          * If the extent is initialized check whether the
4307                          * caller wants to convert it to unwritten.
4308                          */
4309                         if ((!ext4_ext_is_unwritten(ex)) &&
4310                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4311                                 allocated = convert_initialized_extent(
4312                                                 handle, inode, map, &path,
4313                                                 allocated);
4314                                 goto out2;
4315                         } else if (!ext4_ext_is_unwritten(ex))
4316                                 goto out;
4317
4318                         ret = ext4_ext_handle_unwritten_extents(
4319                                 handle, inode, map, &path, flags,
4320                                 allocated, newblock);
4321                         if (ret < 0)
4322                                 err = ret;
4323                         else
4324                                 allocated = ret;
4325                         goto out2;
4326                 }
4327         }
4328
4329         /*
4330          * requested block isn't allocated yet;
4331          * we can't create blocks if the create flag is zero
4332          */
4333         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4334                 ext4_lblk_t hole_start, hole_len;
4335
4336                 hole_start = map->m_lblk;
4337                 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4338                 /*
4339                  * put the just-found gap into the cache to speed up
4340                  * subsequent requests
4341                  */
4342                 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4343
4344                 /* Update hole_len to reflect hole size after map->m_lblk */
4345                 if (hole_start != map->m_lblk)
4346                         hole_len -= map->m_lblk - hole_start;
4347                 map->m_pblk = 0;
4348                 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4349
4350                 goto out2;
4351         }
4352
4353         /*
4354          * Okay, we need to do block allocation.
4355          */
4356         newex.ee_block = cpu_to_le32(map->m_lblk);
4357         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4358
4359         /*
4360          * If we are doing bigalloc, check to see if the extent returned
4361          * by ext4_find_extent() implies a cluster we can use.
4362          */
4363         if (cluster_offset && ex &&
4364             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4365                 ar.len = allocated = map->m_len;
4366                 newblock = map->m_pblk;
4367                 map_from_cluster = true;
4368                 goto got_allocated_blocks;
4369         }
4370
4371         /* find neighbour allocated blocks */
4372         ar.lleft = map->m_lblk;
4373         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4374         if (err)
4375                 goto out2;
4376         ar.lright = map->m_lblk;
4377         ex2 = NULL;
4378         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4379         if (err)
4380                 goto out2;
4381
4382         /* Check if the extent after searching to the right implies a
4383          * cluster we can use. */
4384         if ((sbi->s_cluster_ratio > 1) && ex2 &&
4385             get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4386                 ar.len = allocated = map->m_len;
4387                 newblock = map->m_pblk;
4388                 map_from_cluster = true;
4389                 goto got_allocated_blocks;
4390         }
4391
4392         /*
4393          * See if request is beyond maximum number of blocks we can have in
4394          * a single extent. For an initialized extent this limit is
4395          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4396          * EXT_UNWRITTEN_MAX_LEN.
4397          */
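        /*
         * (For reference: EXT_INIT_MAX_LEN is 32768 blocks and
         * EXT_UNWRITTEN_MAX_LEN is one block less, since the top bit of
         * ee_len is used to mark an extent unwritten.)
         */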
4398         if (map->m_len > EXT_INIT_MAX_LEN &&
4399             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4400                 map->m_len = EXT_INIT_MAX_LEN;
4401         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4402                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4403                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4404
4405         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4406         newex.ee_len = cpu_to_le16(map->m_len);
4407         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4408         if (err)
4409                 allocated = ext4_ext_get_actual_len(&newex);
4410         else
4411                 allocated = map->m_len;
4412
4413         /* allocate new block */
4414         ar.inode = inode;
4415         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4416         ar.logical = map->m_lblk;
4417         /*
4418          * We calculate the offset from the beginning of the cluster
4419          * for the logical block number, since when we allocate a
4420          * physical cluster, the physical block should start at the
4421          * same offset from the beginning of the cluster.  This is
4422          * needed so that future calls to get_implied_cluster_alloc()
4423          * work correctly.
4424          */
4425         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4426         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4427         ar.goal -= offset;
4428         ar.logical -= offset;
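        /*
         * Worked example (illustrative, assuming a cluster ratio of 16):
         * for map->m_lblk == 35, offset == 3, so ar.logical is rounded
         * down to 32 (the cluster boundary) and ar.len is expressed in
         * whole clusters covering offset + allocated blocks.
         */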
4429         if (S_ISREG(inode->i_mode))
4430                 ar.flags = EXT4_MB_HINT_DATA;
4431         else
4432                 /* disable in-core preallocation for non-regular files */
4433                 ar.flags = 0;
4434         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4435                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4436         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4437                 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4438         if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4439                 ar.flags |= EXT4_MB_USE_RESERVED;
4440         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4441         if (!newblock)
4442                 goto out2;
4443         ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4444                   ar.goal, newblock, allocated);
4445         free_on_err = 1;
4446         allocated_clusters = ar.len;
4447         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4448         if (ar.len > allocated)
4449                 ar.len = allocated;
4450
4451 got_allocated_blocks:
4452         /* try to insert new extent into found leaf and return */
4453         ext4_ext_store_pblock(&newex, newblock + offset);
4454         newex.ee_len = cpu_to_le16(ar.len);
4455         /* Mark unwritten */
4456         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4457                 ext4_ext_mark_unwritten(&newex);
4458                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4459         }
4460
4461         err = 0;
4462         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4463                 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4464                                          path, ar.len);
4465         if (!err)
4466                 err = ext4_ext_insert_extent(handle, inode, &path,
4467                                              &newex, flags);
4468
4469         if (err && free_on_err) {
4470                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4471                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4472                 /* free data blocks we just allocated */
4473                 /* not a good idea to call discard here directly,
4474          * but otherwise we'd need to call it for every free() */
4475                 ext4_discard_preallocations(inode);
4476                 ext4_free_blocks(handle, inode, NULL, newblock,
4477                                  EXT4_C2B(sbi, allocated_clusters), fb_flags);
4478                 goto out2;
4479         }
4480
4481         /* previous routine could use block we allocated */
4482         newblock = ext4_ext_pblock(&newex);
4483         allocated = ext4_ext_get_actual_len(&newex);
4484         if (allocated > map->m_len)
4485                 allocated = map->m_len;
4486         map->m_flags |= EXT4_MAP_NEW;
4487
4488         /*
4489          * Reduce the reserved cluster count to reflect successful deferred
4490          * allocation of delayed allocated clusters or direct allocation of
4491          * clusters discovered to be delayed allocated.  Once allocated, a
4492          * cluster is not included in the reserved count.
4493          */
4494         if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
4495                 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4496                         /*
4497                          * When allocating delayed allocated clusters, simply
4498                          * reduce the reserved cluster count and claim quota
4499                          */
4500                         ext4_da_update_reserve_space(inode, allocated_clusters,
4501                                                         1);
4502                 } else {
4503                         ext4_lblk_t lblk, len;
4504                         unsigned int n;
4505
4506                         /*
4507                          * When allocating non-delayed allocated clusters
4508                          * (from fallocate, filemap, DIO, or clusters
4509                          * allocated when delalloc has been disabled by
4510                          * ext4_nonda_switch), reduce the reserved cluster
4511                          * count by the number of allocated clusters that
4512                          * have previously been delayed allocated.  Quota
4513                          * has been claimed by ext4_mb_new_blocks() above,
4514                          * so release the quota reservations made for any
4515                          * previously delayed allocated clusters.
4516                          */
4517                         lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4518                         len = allocated_clusters << sbi->s_cluster_bits;
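                        /*
                         * lblk/len now describe, in blocks, the
                         * cluster-aligned range covering this allocation;
                         * ext4_es_delayed_clu() below counts how many of
                         * those clusters were previously delayed allocated.
                         */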
4519                         n = ext4_es_delayed_clu(inode, lblk, len);
4520                         if (n > 0)
4521                                 ext4_da_update_reserve_space(inode, (int) n, 0);
4522                 }
4523         }
4524
4525         /*
4526          * Cache the extent and update transaction to commit on fdatasync only
4527          * when it is _not_ an unwritten extent.
4528          */
4529         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4530                 ext4_update_inode_fsync_trans(handle, inode, 1);
4531         else
4532                 ext4_update_inode_fsync_trans(handle, inode, 0);
4533 out:
4534         if (allocated > map->m_len)
4535                 allocated = map->m_len;
4536         ext4_ext_show_leaf(inode, path);
4537         map->m_flags |= EXT4_MAP_MAPPED;
4538         map->m_pblk = newblock;
4539         map->m_len = allocated;
4540 out2:
4541         ext4_ext_drop_refs(path);
4542         kfree(path);
4543
4544         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4545                                        err ? err : allocated);
4546         return err ? err : allocated;
4547 }
4548
4549 int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4550 {
4551         struct super_block *sb = inode->i_sb;
4552         ext4_lblk_t last_block;
4553         int err = 0;
4554
4555         /*
4556          * TODO: optimization is possible here.
4557          * Probably we need not scan at all,
4558          * because page truncation is enough.
4559          */
4560
4561         /* we have to know where to truncate from in crash case */
4562         EXT4_I(inode)->i_disksize = inode->i_size;
4563         err = ext4_mark_inode_dirty(handle, inode);
4564         if (err)
4565                 return err;
4566
4567         last_block = (inode->i_size + sb->s_blocksize - 1)
4568                         >> EXT4_BLOCK_SIZE_BITS(sb);
4569 retry:
4570         err = ext4_es_remove_extent(inode, last_block,
4571                                     EXT_MAX_BLOCKS - last_block);
4572         if (err == -ENOMEM) {
4573                 cond_resched();
4574                 congestion_wait(BLK_RW_ASYNC, HZ/50);
4575                 goto retry;
4576         }
4577         if (err)
4578                 return err;
4579         return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4580 }
4581
4582 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4583                                   ext4_lblk_t len, loff_t new_size,
4584                                   int flags)
4585 {
4586         struct inode *inode = file_inode(file);
4587         handle_t *handle;
4588         int ret = 0;
4589         int ret2 = 0;
4590         int retries = 0;
4591         int depth = 0;
4592         struct ext4_map_blocks map;
4593         unsigned int credits;
4594         loff_t epos;
4595
4596         BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4597         map.m_lblk = offset;
4598         map.m_len = len;
4599         /*
4600          * Don't normalize the request if it can fit in one extent so
4601          * that it doesn't get unnecessarily split into multiple
4602          * extents.
4603          */
4604         if (len <= EXT_UNWRITTEN_MAX_LEN)
4605                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4606
4607         /*
4608          * credits to insert 1 extent into extent tree
4609          */
4610         credits = ext4_chunk_trans_blocks(inode, len);
4611         depth = ext_depth(inode);
4612
4613 retry:
4614         while (ret >= 0 && len) {
4615                 /*
4616                  * Recalculate credits when extent tree depth changes.
4617                  */
4618                 if (depth != ext_depth(inode)) {
4619                         credits = ext4_chunk_trans_blocks(inode, len);
4620                         depth = ext_depth(inode);
4621                 }
4622
4623                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4624                                             credits);
4625                 if (IS_ERR(handle)) {
4626                         ret = PTR_ERR(handle);
4627                         break;
4628                 }
4629                 ret = ext4_map_blocks(handle, inode, &map, flags);
4630                 if (ret <= 0) {
4631                         ext4_debug("inode #%lu: block %u: len %u: "
4632                                    "ext4_ext_map_blocks returned %d",
4633                                    inode->i_ino, map.m_lblk,
4634                                    map.m_len, ret);
4635                         ext4_mark_inode_dirty(handle, inode);
4636                         ret2 = ext4_journal_stop(handle);
4637                         break;
4638                 }
4639                 map.m_lblk += ret;
4640                 map.m_len = len = len - ret;
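                /*
                 * epos below is the file offset just past the blocks mapped
                 * so far; it is used to advance i_size/i_disksize as the
                 * allocation makes progress.
                 */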
4641                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4642                 inode->i_ctime = current_time(inode);
4643                 if (new_size) {
4644                         if (epos > new_size)
4645                                 epos = new_size;
4646                         if (ext4_update_inode_size(inode, epos) & 0x1)
4647                                 inode->i_mtime = inode->i_ctime;
4648                 } else {
4649                         if (epos > inode->i_size)
4650                                 ext4_set_inode_flag(inode,
4651                                                     EXT4_INODE_EOFBLOCKS);
4652                 }
4653                 ext4_mark_inode_dirty(handle, inode);
4654                 ext4_update_inode_fsync_trans(handle, inode, 1);
4655                 ret2 = ext4_journal_stop(handle);
4656                 if (ret2)
4657                         break;
4658         }
4659         if (ret == -ENOSPC &&
4660                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
4661                 ret = 0;
4662                 goto retry;
4663         }
4664
4665         return ret > 0 ? ret2 : ret;
4666 }
4667
4668 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
4669
4670 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
4671
4672 static long ext4_zero_range(struct file *file, loff_t offset,
4673                             loff_t len, int mode)
4674 {
4675         struct inode *inode = file_inode(file);
4676         handle_t *handle = NULL;
4677         unsigned int max_blocks;
4678         loff_t new_size = 0;
4679         int ret = 0;
4680         int flags;
4681         int credits;
4682         int partial_begin, partial_end;
4683         loff_t start, end;
4684         ext4_lblk_t lblk;
4685         unsigned int blkbits = inode->i_blkbits;
4686
4687         trace_ext4_zero_range(inode, offset, len, mode);
4688
4689         /* Call ext4_force_commit to flush all data in case of data=journal. */
4690         if (ext4_should_journal_data(inode)) {
4691                 ret = ext4_force_commit(inode->i_sb);
4692                 if (ret)
4693                         return ret;
4694         }
4695
4696         /*
4697          * Round up offset. This is not fallocate, we need to zero out
4698          * blocks, so convert the interior block-aligned part of the range to
4699          * unwritten and possibly manually zero out unaligned parts of the
4700          * range.
4701          */
4702         start = round_up(offset, 1 << blkbits);
4703         end = round_down((offset + len), 1 << blkbits);
4704
4705         if (start < offset || end > offset + len)
4706                 return -EINVAL;
4707         partial_begin = offset & ((1 << blkbits) - 1);
4708         partial_end = (offset + len) & ((1 << blkbits) - 1);
4709
4710         lblk = start >> blkbits;
4711         max_blocks = (end >> blkbits);
4712         if (max_blocks < lblk)
4713                 max_blocks = 0;
4714         else
4715                 max_blocks -= lblk;
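        /*
         * Worked example (illustrative): with 4K blocks (blkbits == 12),
         * offset == 5000 and len == 10000 give start == 8192, end == 12288,
         * partial_begin == 904, partial_end == 2712, lblk == 2 and
         * max_blocks == 1.
         */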
4716
4717         inode_lock(inode);
4718
4719         /*
4720          * Indirect files do not support unwritten extents
4721          */
4722         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4723                 ret = -EOPNOTSUPP;
4724                 goto out_mutex;
4725         }
4726
4727         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4728             (offset + len > inode->i_size ||
4729              offset + len > EXT4_I(inode)->i_disksize)) {
4730                 new_size = offset + len;
4731                 ret = inode_newsize_ok(inode, new_size);
4732                 if (ret)
4733                         goto out_mutex;
4734         }
4735
4736         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4737         if (mode & FALLOC_FL_KEEP_SIZE)
4738                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4739
4740         /* Wait for all existing DIO workers; newcomers will block on i_mutex */
4741         inode_dio_wait(inode);
4742
4743         /* Preallocate the range including the unaligned edges */
4744         if (partial_begin || partial_end) {
4745                 ret = ext4_alloc_file_blocks(file,
4746                                 round_down(offset, 1 << blkbits) >> blkbits,
4747                                 (round_up((offset + len), 1 << blkbits) -
4748                                  round_down(offset, 1 << blkbits)) >> blkbits,
4749                                 new_size, flags);
4750                 if (ret)
4751                         goto out_mutex;
4752
4753         }
4754
4755         /* Zero range excluding the unaligned edges */
4756         if (max_blocks > 0) {
4757                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4758                           EXT4_EX_NOCACHE);
4759
4760                 /*
4761                  * Prevent page faults from reinstantiating pages we have
4762                  * released from page cache.
4763                  */
4764                 down_write(&EXT4_I(inode)->i_mmap_sem);
4765
4766                 ret = ext4_break_layouts(inode);
4767                 if (ret) {
4768                         up_write(&EXT4_I(inode)->i_mmap_sem);
4769                         goto out_mutex;
4770                 }
4771
4772                 ret = ext4_update_disksize_before_punch(inode, offset, len);
4773                 if (ret) {
4774                         up_write(&EXT4_I(inode)->i_mmap_sem);
4775                         goto out_mutex;
4776                 }
4777                 /* Now release the pages and zero block aligned part of pages */
4778                 truncate_pagecache_range(inode, start, end - 1);
4779                 inode->i_mtime = inode->i_ctime = current_time(inode);
4780
4781                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4782                                              flags);
4783                 up_write(&EXT4_I(inode)->i_mmap_sem);
4784                 if (ret)
4785                         goto out_mutex;
4786         }
4787         if (!partial_begin && !partial_end)
4788                 goto out_mutex;
4789
4790         /*
4791          * In worst case we have to writeout two nonadjacent unwritten
4792          * blocks and update the inode
4793          */
4794         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4795         if (ext4_should_journal_data(inode))
4796                 credits += 2;
4797         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4798         if (IS_ERR(handle)) {
4799                 ret = PTR_ERR(handle);
4800                 ext4_std_error(inode->i_sb, ret);
4801                 goto out_mutex;
4802         }
4803
4804         inode->i_mtime = inode->i_ctime = current_time(inode);
4805         if (new_size) {
4806                 ext4_update_inode_size(inode, new_size);
4807         } else {
4808                 /*
4809                  * Mark that we allocate beyond EOF so the subsequent truncate
4810                  * can proceed even if the new size is the same as i_size.
4811                  */
4812                 if (offset + len > inode->i_size)
4813                         ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4814         }
4815         ext4_mark_inode_dirty(handle, inode);
4816
4817         /* Zero out partial block at the edges of the range */
4818         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4819         if (ret >= 0)
4820                 ext4_update_inode_fsync_trans(handle, inode, 1);
4821
4822         if (file->f_flags & O_SYNC)
4823                 ext4_handle_sync(handle);
4824
4825         ext4_journal_stop(handle);
4826 out_mutex:
4827         inode_unlock(inode);
4828         return ret;
4829 }
4830
4831 /*
4832  * preallocate space for a file. This implements ext4's fallocate file
4833  * operation, which gets called from sys_fallocate system call.
4834  * For block-mapped files, posix_fallocate should fall back to the method
4835  * of writing zeroes to the required new blocks (the same behavior which is
4836  * expected for file systems which do not support fallocate() system call).
4837  */
4838 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4839 {
4840         struct inode *inode = file_inode(file);
4841         loff_t new_size = 0;
4842         unsigned int max_blocks;
4843         int ret = 0;
4844         int flags;
4845         ext4_lblk_t lblk;
4846         unsigned int blkbits = inode->i_blkbits;
4847
4848         /*
4849          * Encrypted inodes can't handle collapse range or insert
4850          * range since we would need to re-encrypt blocks with a
4851          * different IV or XTS tweak (which are based on the logical
4852          * block number).
4853          */
4854         if (IS_ENCRYPTED(inode) &&
4855             (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4856                 return -EOPNOTSUPP;
4857
4858         /* Return error if mode is not supported */
4859         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4860                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4861                      FALLOC_FL_INSERT_RANGE))
4862                 return -EOPNOTSUPP;
4863
4864         if (mode & FALLOC_FL_PUNCH_HOLE)
4865                 return ext4_punch_hole(inode, offset, len);
4866
4867         ret = ext4_convert_inline_data(inode);
4868         if (ret)
4869                 return ret;
4870
4871         if (mode & FALLOC_FL_COLLAPSE_RANGE)
4872                 return ext4_collapse_range(inode, offset, len);
4873
4874         if (mode & FALLOC_FL_INSERT_RANGE)
4875                 return ext4_insert_range(inode, offset, len);
4876
4877         if (mode & FALLOC_FL_ZERO_RANGE)
4878                 return ext4_zero_range(file, offset, len, mode);
4879
4880         trace_ext4_fallocate_enter(inode, offset, len, mode);
4881         lblk = offset >> blkbits;
4882
4883         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
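        /*
         * max_blocks counts the filesystem blocks needed to cover
         * [offset, offset + len), including any partially covered blocks
         * at either edge.
         */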
4884         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4885         if (mode & FALLOC_FL_KEEP_SIZE)
4886                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4887
4888         inode_lock(inode);
4889
4890         /*
4891          * We only support preallocation for extent-based files
4892          */
4893         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4894                 ret = -EOPNOTSUPP;
4895                 goto out;
4896         }
4897
4898         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4899             (offset + len > inode->i_size ||
4900              offset + len > EXT4_I(inode)->i_disksize)) {
4901                 new_size = offset + len;
4902                 ret = inode_newsize_ok(inode, new_size);
4903                 if (ret)
4904                         goto out;
4905         }
4906
4907         /* Wait for all existing DIO workers; newcomers will block on i_mutex */
4908         inode_dio_wait(inode);
4909
4910         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4911         if (ret)
4912                 goto out;
4913
4914         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4915                 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4916                                                 EXT4_I(inode)->i_sync_tid);
4917         }
4918 out:
4919         inode_unlock(inode);
4920         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4921         return ret;
4922 }
4923
4924 /*
4925  * This function converts a range of blocks to written extents.
4926  * The caller of this function will pass the start offset and the size.
4927  * All unwritten extents within this range will be converted to
4928  * written extents.
4929  *
4930  * This function is called from the direct IO end_io callback
4931  * function, to convert the fallocated extents after IO is completed.
4932  * Returns 0 on success.
4933  */
4934 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4935                                    loff_t offset, ssize_t len)
4936 {
4937         unsigned int max_blocks;
4938         int ret = 0;
4939         int ret2 = 0;
4940         struct ext4_map_blocks map;
4941         unsigned int blkbits = inode->i_blkbits;
4942         unsigned int credits = 0;
4943
4944         map.m_lblk = offset >> blkbits;
4945         max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4946
4947         if (!handle) {
4948                 /*
4949                  * credits to insert 1 extent into extent tree
4950                  */
4951                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4952         }
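        /*
         * Each pass of the loop below converts up to max_blocks unwritten
         * blocks starting at map.m_lblk; a positive return value is the
         * number of blocks handled, and the loop advances by that amount.
         */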
4953         while (ret >= 0 && ret < max_blocks) {
4954                 map.m_lblk += ret;
4955                 map.m_len = (max_blocks -= ret);
4956                 if (credits) {
4957                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4958                                                     credits);
4959                         if (IS_ERR(handle)) {
4960                                 ret = PTR_ERR(handle);
4961                                 break;
4962                         }
4963                 }
4964                 ret = ext4_map_blocks(handle, inode, &map,
4965                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4966                 if (ret <= 0)
4967                         ext4_warning(inode->i_sb,
4968                                      "inode #%lu: block %u: len %u: "
4969                                      "ext4_ext_map_blocks returned %d",
4970                                      inode->i_ino, map.m_lblk,
4971                                      map.m_len, ret);
4972                 ext4_mark_inode_dirty(handle, inode);
4973                 if (credits)
4974                         ret2 = ext4_journal_stop(handle);
4975                 if (ret <= 0 || ret2)
4976                         break;
4977         }
4978         return ret > 0 ? ret2 : ret;
4979 }
4980
4981 int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4982 {
4983         int ret, err = 0;
4984         struct ext4_io_end_vec *io_end_vec;
4985
4986         /*
4987          * This is somewhat ugly but the idea is clear: when a transaction is
4988          * reserved, everything goes into it. Otherwise we'd rather start several
4989          * smaller transactions to convert each extent separately.
4990          */
4991         if (handle) {
4992                 handle = ext4_journal_start_reserved(handle,
4993                                                      EXT4_HT_EXT_CONVERT);
4994                 if (IS_ERR(handle))
4995                         return PTR_ERR(handle);
4996         }
4997
4998         list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4999                 ret = ext4_convert_unwritten_extents(handle, io_end->inode,
5000                                                      io_end_vec->offset,
5001                                                      io_end_vec->size);
5002                 if (ret)
5003                         break;
5004         }
5005
5006         if (handle)
5007                 err = ext4_journal_stop(handle);
5008
5009         return ret < 0 ? ret : err;
5010 }
5011
5012 /*
5013  * If newes is not an existing extent (newes->es_pblk equals zero), find
5014  * a delayed extent at the start of newes, update newes accordingly and
5015  * return the start of the next delayed extent.
5016  *
5017  * If newes is an existing extent (newes->es_pblk does not equal zero),
5018  * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
5019  * delayed extent is found. Leave newes unmodified.
5020  */
5021 static int ext4_find_delayed_extent(struct inode *inode,
5022                                     struct extent_status *newes)
5023 {
5024         struct extent_status es;
5025         ext4_lblk_t block, next_del;
5026
5027         if (newes->es_pblk == 0) {
5028                 ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
5029                                           newes->es_lblk,
5030                                           newes->es_lblk + newes->es_len - 1,
5031                                           &es);
5032
5033                 /*
5034                  * No extent in the extent tree contains the queried block,
5035                  * so it may lie in 1) a hole or 2) a delayed extent.
5036                  */
5037                 if (es.es_len == 0)
5038                         /* A hole found. */
5039                         return 0;
5040
5041                 if (es.es_lblk > newes->es_lblk) {
5042                         /* A hole found. */
5043                         newes->es_len = min(es.es_lblk - newes->es_lblk,
5044                                             newes->es_len);
5045                         return 0;
5046                 }
5047
5048                 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5049         }
5050
5051         block = newes->es_lblk + newes->es_len;
5052         ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
5053                                   EXT_MAX_BLOCKS, &es);
5054         if (es.es_len == 0)
5055                 next_del = EXT_MAX_BLOCKS;
5056         else
5057                 next_del = es.es_lblk;
5058
5059         return next_del;
5060 }
5061
5062 static int ext4_xattr_fiemap(struct inode *inode,
5063                                 struct fiemap_extent_info *fieinfo)
5064 {
5065         __u64 physical = 0;
5066         __u64 length;
5067         __u32 flags = FIEMAP_EXTENT_LAST;
5068         int blockbits = inode->i_sb->s_blocksize_bits;
5069         int error = 0;
5070
5071         /* in-inode? */
5072         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5073                 struct ext4_iloc iloc;
5074                 int offset;     /* offset of xattr in inode */
5075
5076                 error = ext4_get_inode_loc(inode, &iloc);
5077                 if (error)
5078                         return error;
5079                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
5080                 offset = EXT4_GOOD_OLD_INODE_SIZE +
5081                                 EXT4_I(inode)->i_extra_isize;
5082                 physical += offset;
5083                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5084                 flags |= FIEMAP_EXTENT_DATA_INLINE;
5085                 brelse(iloc.bh);
5086         } else { /* external block */
5087                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5088                 length = inode->i_sb->s_blocksize;
5089         }
5090
5091         if (physical)
5092                 error = fiemap_fill_next_extent(fieinfo, 0, physical,
5093                                                 length, flags);
5094         return (error < 0 ? error : 0);
5095 }
5096
5097 static int _ext4_fiemap(struct inode *inode,
5098                         struct fiemap_extent_info *fieinfo,
5099                         __u64 start, __u64 len,
5100                         int (*fill)(struct inode *, ext4_lblk_t,
5101                                     ext4_lblk_t,
5102                                     struct fiemap_extent_info *))
5103 {
5104         ext4_lblk_t start_blk;
5105         u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
5106
5107         int error = 0;
5108
5109         if (ext4_has_inline_data(inode)) {
5110                 int has_inline = 1;
5111
5112                 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5113                                                 start, len);
5114
5115                 if (has_inline)
5116                         return error;
5117         }
5118
5119         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5120                 error = ext4_ext_precache(inode);
5121                 if (error)
5122                         return error;
5123                 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5124         }
5125
5126         /* fallback to generic here if not in extents fmt */
5127         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
5128             fill == ext4_fill_fiemap_extents)
5129                 return generic_block_fiemap(inode, fieinfo, start, len,
5130                         ext4_get_block);
5131
5132         if (fill == ext4_fill_es_cache_info)
5133                 ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
5134         if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
5135                 return -EBADR;
5136
5137         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5138                 error = ext4_xattr_fiemap(inode, fieinfo);
5139         } else {
5140                 ext4_lblk_t len_blks;
5141                 __u64 last_blk;
5142
5143                 start_blk = start >> inode->i_sb->s_blocksize_bits;
5144                 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5145                 if (last_blk >= EXT_MAX_BLOCKS)
5146                         last_blk = EXT_MAX_BLOCKS-1;
5147                 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5148
5149                 /*
5150                  * Walk the extent tree gathering extent information
5151                  * and pushing extents back to the user.
5152                  */
5153                 error = fill(inode, start_blk, len_blks, fieinfo);
5154         }
5155         return error;
5156 }
5157
5158 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5159                 __u64 start, __u64 len)
5160 {
5161         return _ext4_fiemap(inode, fieinfo, start, len,
5162                             ext4_fill_fiemap_extents);
5163 }
5164
5165 int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
5166                       __u64 start, __u64 len)
5167 {
5168         if (ext4_has_inline_data(inode)) {
5169                 int has_inline;
5170
5171                 down_read(&EXT4_I(inode)->xattr_sem);
5172                 has_inline = ext4_has_inline_data(inode);
5173                 up_read(&EXT4_I(inode)->xattr_sem);
5174                 if (has_inline)
5175                         return 0;
5176         }
5177
5178         return _ext4_fiemap(inode, fieinfo, start, len,
5179                             ext4_fill_es_cache_info);
5180 }
5181
5182
5183 /*
5184  * ext4_access_path:
5185  * Function to access the path buffer for marking it dirty.
5186  * It also checks if there are sufficient credits left in the journal handle
5187  * to update path.
5188  */
5189 static int
5190 ext4_access_path(handle_t *handle, struct inode *inode,
5191                 struct ext4_ext_path *path)
5192 {
5193         int credits, err;
5194
5195         if (!ext4_handle_valid(handle))
5196                 return 0;
5197
5198         /*
5199          * Check if we need to extend journal credits:
5200          * 3 for leaf, sb, and inode plus 2 (bmap and group
5201          * descriptor) for each block group; assume two block
5202          * groups
5203          */
5204         credits = ext4_writepage_trans_blocks(inode);
5205         err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
5206         if (err < 0)
5207                 return err;
5208
5209         err = ext4_ext_get_access(handle, inode, path);
5210         return err;
5211 }
5212
5213 /*
5214  * ext4_ext_shift_path_extents:
5215  * Shift the extents of a path structure lying between path[depth].p_ext
5216  * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5217  * whether it is a right or a left shift operation.
5218  */
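/*
 * Illustrative example: a left shift by 2 turns an extent starting at
 * logical block 10 into one starting at block 8; the index entries above
 * the leaf are adjusted only when the leaf's first extent was part of the
 * shifted range.
 */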
5219 static int
5220 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5221                             struct inode *inode, handle_t *handle,
5222                             enum SHIFT_DIRECTION SHIFT)
5223 {
5224         int depth, err = 0;
5225         struct ext4_extent *ex_start, *ex_last;
5226         bool update = false;
5227         depth = path->p_depth;
5228
5229         while (depth >= 0) {
5230                 if (depth == path->p_depth) {
5231                         ex_start = path[depth].p_ext;
5232                         if (!ex_start)
5233                                 return -EFSCORRUPTED;
5234
5235                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5236
5237                         err = ext4_access_path(handle, inode, path + depth);
5238                         if (err)
5239                                 goto out;
5240
5241                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5242                                 update = true;
5243
5244                         while (ex_start <= ex_last) {
5245                                 if (SHIFT == SHIFT_LEFT) {
5246                                         le32_add_cpu(&ex_start->ee_block,
5247                                                 -shift);
5248                                         /* Try to merge to the left. */
5249                                         if ((ex_start >
5250                                             EXT_FIRST_EXTENT(path[depth].p_hdr))
5251                                             &&
5252                                             ext4_ext_try_to_merge_right(inode,
5253                                             path, ex_start - 1))
5254                                                 ex_last--;
5255                                         else
5256                                                 ex_start++;
5257                                 } else {
5258                                         le32_add_cpu(&ex_last->ee_block, shift);
5259                                         ext4_ext_try_to_merge_right(inode, path,
5260                                                 ex_last);
5261                                         ex_last--;
5262                                 }
5263                         }
5264                         err = ext4_ext_dirty(handle, inode, path + depth);
5265                         if (err)
5266                                 goto out;
5267
5268                         if (--depth < 0 || !update)
5269                                 break;
5270                 }
5271
5272                 /* Update index too */
5273                 err = ext4_access_path(handle, inode, path + depth);
5274                 if (err)
5275                         goto out;
5276
5277                 if (SHIFT == SHIFT_LEFT)
5278                         le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5279                 else
5280                         le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5281                 err = ext4_ext_dirty(handle, inode, path + depth);
5282                 if (err)
5283                         goto out;
5284
5285                 /* we are done if current index is not a starting index */
5286                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5287                         break;
5288
5289                 depth--;
5290         }
5291
5292 out:
5293         return err;
5294 }
5295
5296 /*
5297  * ext4_ext_shift_extents:
5298  * All the extents lying in the range from @start to the last allocated
5299  * block of the @inode are shifted either left or right (depending on
5300  * @SHIFT) by @shift blocks.
5301  * On success, 0 is returned; an error code otherwise.
5302  */
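/*
 * For example, the collapse-range path below calls
 *
 *	ext4_ext_shift_extents(inode, handle, punch_stop,
 *			       punch_stop - punch_start, SHIFT_LEFT);
 *
 * to pull everything at or beyond the punched range to the left, while
 * insert-range shifts right by the inserted length starting at the hole.
 */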
5303 static int
5304 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5305                        ext4_lblk_t start, ext4_lblk_t shift,
5306                        enum SHIFT_DIRECTION SHIFT)
5307 {
5308         struct ext4_ext_path *path;
5309         int ret = 0, depth;
5310         struct ext4_extent *extent;
5311         ext4_lblk_t stop, *iterator, ex_start, ex_end;
5312
5313         /* Let path point to the last extent */
5314         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5315                                 EXT4_EX_NOCACHE);
5316         if (IS_ERR(path))
5317                 return PTR_ERR(path);
5318
5319         depth = path->p_depth;
5320         extent = path[depth].p_ext;
5321         if (!extent)
5322                 goto out;
5323
5324         stop = le32_to_cpu(extent->ee_block);
5325
5326         /*
5327          * For left shifts, make sure the hole on the left is big enough to
5328          * accommodate the shift.  For right shifts, make sure the last extent
5329          * won't be shifted beyond EXT_MAX_BLOCKS.
5330          */
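        /*
         * A rough worked example (hypothetical numbers): if the extent just
         * to the left of @start ends right before block ex_end == 100 and
         * @start == 120, any shift greater than 20 blocks would make the
         * shifted extents collide with that extent, so the request is
         * rejected with -EINVAL by the check below.
         */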
5331         if (SHIFT == SHIFT_LEFT) {
5332                 path = ext4_find_extent(inode, start - 1, &path,
5333                                         EXT4_EX_NOCACHE);
5334                 if (IS_ERR(path))
5335                         return PTR_ERR(path);
5336                 depth = path->p_depth;
5337                 extent =  path[depth].p_ext;
5338                 if (extent) {
5339                         ex_start = le32_to_cpu(extent->ee_block);
5340                         ex_end = le32_to_cpu(extent->ee_block) +
5341                                 ext4_ext_get_actual_len(extent);
5342                 } else {
5343                         ex_start = 0;
5344                         ex_end = 0;
5345                 }
5346
5347                 if ((start == ex_start && shift > ex_start) ||
5348                     (shift > start - ex_end)) {
5349                         ret = -EINVAL;
5350                         goto out;
5351                 }
5352         } else {
5353                 if (shift > EXT_MAX_BLOCKS -
5354                     (stop + ext4_ext_get_actual_len(extent))) {
5355                         ret = -EINVAL;
5356                         goto out;
5357                 }
5358         }
5359
5360         /*
5361          * For a left shift, the iterator points to @start and is increased
5362          * until it reaches @stop. For a right shift, the iterator points to
5363          * @stop and is decreased until it reaches @start.
5364          */
5365         if (SHIFT == SHIFT_LEFT)
5366                 iterator = &start;
5367         else
5368                 iterator = &stop;
5369
5370         /*
5371          * It's safe to start updating extents.  Start and stop are unsigned,
5372          * so for a right shift, once the extent at block 0 is reached the
5373          * iterator is set to NULL to indicate the end of the loop.
5374          */
5375         while (iterator && start <= stop) {
5376                 path = ext4_find_extent(inode, *iterator, &path,
5377                                         EXT4_EX_NOCACHE);
5378                 if (IS_ERR(path))
5379                         return PTR_ERR(path);
5380                 depth = path->p_depth;
5381                 extent = path[depth].p_ext;
5382                 if (!extent) {
5383                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5384                                          (unsigned long) *iterator);
5385                         return -EFSCORRUPTED;
5386                 }
5387                 if (SHIFT == SHIFT_LEFT && *iterator >
5388                     le32_to_cpu(extent->ee_block)) {
5389                         /* Hole, move to the next extent */
5390                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5391                                 path[depth].p_ext++;
5392                         } else {
5393                                 *iterator = ext4_ext_next_allocated_block(path);
5394                                 continue;
5395                         }
5396                 }
5397
5398                 if (SHIFT == SHIFT_LEFT) {
5399                         extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5400                         *iterator = le32_to_cpu(extent->ee_block) +
5401                                         ext4_ext_get_actual_len(extent);
5402                 } else {
5403                         extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5404                         if (le32_to_cpu(extent->ee_block) > 0)
5405                                 *iterator = le32_to_cpu(extent->ee_block) - 1;
5406                         else
5407                                 /* Beginning is reached, end of the loop */
5408                                 iterator = NULL;
5409                         /* Update the path extent so shifting does not go below @start */
5410                         while (le32_to_cpu(extent->ee_block) < start)
5411                                 extent++;
5412                         path[depth].p_ext = extent;
5413                 }
5414                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5415                                 handle, SHIFT);
5416                 if (ret)
5417                         break;
5418         }
5419 out:
5420         ext4_ext_drop_refs(path);
5421         kfree(path);
5422         return ret;
5423 }
5424
5425 /*
5426  * ext4_collapse_range:
5427  * This implements the collapse-range functionality of fallocate() for ext4.
5428  * Returns 0 on success, a negative error code on failure.
5429  */
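/*
 * Minimal userspace sketch (illustrative only; assumes a 4 KiB cluster size
 * and an already-open file descriptor fd):
 *
 *	fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 65536, 8192);
 *
 * removes 8 KiB starting at offset 64 KiB and shifts the remainder of the
 * file left.  Offset and length must be cluster-aligned and the range must
 * end before EOF, otherwise the checks below return -EINVAL.
 */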
5430 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5431 {
5432         struct super_block *sb = inode->i_sb;
5433         ext4_lblk_t punch_start, punch_stop;
5434         handle_t *handle;
5435         unsigned int credits;
5436         loff_t new_size, ioffset;
5437         int ret;
5438
5439         /*
5440          * We need to test this early because xfstests assumes that a
5441          * collapse range of (0, 1) will return EOPNOTSUPP if the file
5442          * system does not support collapse range.
5443          */
5444         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5445                 return -EOPNOTSUPP;
5446
5447         /* Collapse range works only on fs cluster size aligned regions. */
5448         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5449                 return -EINVAL;
5450
5451         trace_ext4_collapse_range(inode, offset, len);
5452
5453         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5454         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5455
5456         /* Call ext4_force_commit to flush all data in case of data=journal. */
5457         if (ext4_should_journal_data(inode)) {
5458                 ret = ext4_force_commit(inode->i_sb);
5459                 if (ret)
5460                         return ret;
5461         }
5462
5463         inode_lock(inode);
5464         /*
5465          * The collapse range must not reach or extend past EOF; such a
5466          * request would effectively be a truncate and is rejected here.
5467          */
5468         if (offset + len >= inode->i_size) {
5469                 ret = -EINVAL;
5470                 goto out_mutex;
5471         }
5472
5473         /* Currently just for extent based files */
5474         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5475                 ret = -EOPNOTSUPP;
5476                 goto out_mutex;
5477         }
5478
5479         /* Wait for existing dio to complete */
5480         inode_dio_wait(inode);
5481
5482         /*
5483          * Prevent page faults from reinstantiating pages we have released from
5484          * page cache.
5485          */
5486         down_write(&EXT4_I(inode)->i_mmap_sem);
5487
5488         ret = ext4_break_layouts(inode);
5489         if (ret)
5490                 goto out_mmap;
5491
5492         /*
5493          * Need to round down offset to be aligned with page size boundary
5494          * for page size > block size.
5495          */
5496         ioffset = round_down(offset, PAGE_SIZE);
5497         /*
5498          * Write tail of the last page before removed range since it will get
5499          * removed from the page cache below.
5500          */
5501         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5502         if (ret)
5503                 goto out_mmap;
5504         /*
5505          * Write out the data that will be shifted so it is preserved when
5506          * the page cache is discarded below. We are also protected from
5507          * pages becoming dirty by i_mmap_sem.
5508          */
5509         ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5510                                            LLONG_MAX);
5511         if (ret)
5512                 goto out_mmap;
5513         truncate_pagecache(inode, ioffset);
5514
5515         credits = ext4_writepage_trans_blocks(inode);
5516         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5517         if (IS_ERR(handle)) {
5518                 ret = PTR_ERR(handle);
5519                 goto out_mmap;
5520         }
5521
5522         down_write(&EXT4_I(inode)->i_data_sem);
5523         ext4_discard_preallocations(inode);
5524
5525         ret = ext4_es_remove_extent(inode, punch_start,
5526                                     EXT_MAX_BLOCKS - punch_start);
5527         if (ret) {
5528                 up_write(&EXT4_I(inode)->i_data_sem);
5529                 goto out_stop;
5530         }
5531
5532         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5533         if (ret) {
5534                 up_write(&EXT4_I(inode)->i_data_sem);
5535                 goto out_stop;
5536         }
5537         ext4_discard_preallocations(inode);
5538
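        /*
         * Everything at or beyond punch_stop is pulled left by the length of
         * the collapsed range, punch_stop - punch_start blocks.
         */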
5539         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5540                                      punch_stop - punch_start, SHIFT_LEFT);
5541         if (ret) {
5542                 up_write(&EXT4_I(inode)->i_data_sem);
5543                 goto out_stop;
5544         }
5545
5546         new_size = inode->i_size - len;
5547         i_size_write(inode, new_size);
5548         EXT4_I(inode)->i_disksize = new_size;
5549
5550         up_write(&EXT4_I(inode)->i_data_sem);
5551         if (IS_SYNC(inode))
5552                 ext4_handle_sync(handle);
5553         inode->i_mtime = inode->i_ctime = current_time(inode);
5554         ext4_mark_inode_dirty(handle, inode);
5555         ext4_update_inode_fsync_trans(handle, inode, 1);
5556
5557 out_stop:
5558         ext4_journal_stop(handle);
5559 out_mmap:
5560         up_write(&EXT4_I(inode)->i_mmap_sem);
5561 out_mutex:
5562         inode_unlock(inode);
5563         return ret;
5564 }
5565
5566 /*
5567  * ext4_insert_range:
5568  * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5569  * The data blocks from @offset to EOF are shifted right by @len bytes
5570  * to create a hole in the @inode. The inode size is increased by @len
5571  * bytes.
5572  * Returns 0 on success, an error code otherwise.
5573  */
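/*
 * Minimal userspace sketch (illustrative only; assumes a 4 KiB cluster size
 * and an already-open file descriptor fd):
 *
 *	fallocate(fd, FALLOC_FL_INSERT_RANGE, 65536, 8192);
 *
 * inserts an 8 KiB hole at offset 64 KiB and grows the file by the same
 * amount.  Offset and length must be cluster-aligned, @offset must lie
 * before EOF, and the resulting size must not exceed s_maxbytes; otherwise
 * the checks below fail with -EINVAL or -EFBIG.
 */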
5574 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5575 {
5576         struct super_block *sb = inode->i_sb;
5577         handle_t *handle;
5578         struct ext4_ext_path *path;
5579         struct ext4_extent *extent;
5580         ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5581         unsigned int credits, ee_len;
5582         int ret = 0, depth, split_flag = 0;
5583         loff_t ioffset;
5584
5585         /*
5586          * We need to test this early because xfstests assumes that an
5587          * insert range of (0, 1) will return EOPNOTSUPP if the file
5588          * system does not support insert range.
5589          */
5590         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5591                 return -EOPNOTSUPP;
5592
5593         /* Insert range works only on fs cluster size aligned regions. */
5594         if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5595                 return -EINVAL;
5596
5597         trace_ext4_insert_range(inode, offset, len);
5598
5599         offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5600         len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5601
5602         /* Call ext4_force_commit to flush all data in case of data=journal */
5603         if (ext4_should_journal_data(inode)) {
5604                 ret = ext4_force_commit(inode->i_sb);
5605                 if (ret)
5606                         return ret;
5607         }
5608
5609         inode_lock(inode);
5610         /* Currently just for extent based files */
5611         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5612                 ret = -EOPNOTSUPP;
5613                 goto out_mutex;
5614         }
5615
5616         /* Check whether the maximum file size would be exceeded */
5617         if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5618                 ret = -EFBIG;
5619                 goto out_mutex;
5620         }
5621
5622         /* Offset must be less than i_size */
5623         if (offset >= inode->i_size) {
5624                 ret = -EINVAL;
5625                 goto out_mutex;
5626         }
5627
5628         /* Wait for existing dio to complete */
5629         inode_dio_wait(inode);
5630
5631         /*
5632          * Prevent page faults from reinstantiating pages we have released from
5633          * page cache.
5634          */
5635         down_write(&EXT4_I(inode)->i_mmap_sem);
5636
5637         ret = ext4_break_layouts(inode);
5638         if (ret)
5639                 goto out_mmap;
5640
5641         /*
5642          * Need to round down to align start offset to page size boundary
5643          * for page size > block size.
5644          */
5645         ioffset = round_down(offset, PAGE_SIZE);
5646         /* Write out all dirty pages */
5647         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5648                         LLONG_MAX);
5649         if (ret)
5650                 goto out_mmap;
5651         truncate_pagecache(inode, ioffset);
5652
5653         credits = ext4_writepage_trans_blocks(inode);
5654         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5655         if (IS_ERR(handle)) {
5656                 ret = PTR_ERR(handle);
5657                 goto out_mmap;
5658         }
5659
5660         /* Expand file to avoid data loss if there is an error while shifting */
5661         inode->i_size += len;
5662         EXT4_I(inode)->i_disksize += len;
5663         inode->i_mtime = inode->i_ctime = current_time(inode);
5664         ret = ext4_mark_inode_dirty(handle, inode);
5665         if (ret)
5666                 goto out_stop;
5667
5668         down_write(&EXT4_I(inode)->i_data_sem);
5669         ext4_discard_preallocations(inode);
5670
5671         path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5672         if (IS_ERR(path)) {
5673                 up_write(&EXT4_I(inode)->i_data_sem);
5674                 goto out_stop;
5675         }
5676
5677         depth = ext_depth(inode);
5678         extent = path[depth].p_ext;
5679         if (extent) {
5680                 ee_start_lblk = le32_to_cpu(extent->ee_block);
5681                 ee_len = ext4_ext_get_actual_len(extent);
5682
5683                 /*
5684                  * If offset_lblk is not the starting block of the extent,
5685                  * split the extent at @offset_lblk
5686                  */
5687                 if ((offset_lblk > ee_start_lblk) &&
5688                                 (offset_lblk < (ee_start_lblk + ee_len))) {
5689                         if (ext4_ext_is_unwritten(extent))
5690                                 split_flag = EXT4_EXT_MARK_UNWRIT1 |
5691                                         EXT4_EXT_MARK_UNWRIT2;
5692                         ret = ext4_split_extent_at(handle, inode, &path,
5693                                         offset_lblk, split_flag,
5694                                         EXT4_EX_NOCACHE |
5695                                         EXT4_GET_BLOCKS_PRE_IO |
5696                                         EXT4_GET_BLOCKS_METADATA_NOFAIL);
5697                 }
5698
5699                 ext4_ext_drop_refs(path);
5700                 kfree(path);
5701                 if (ret < 0) {
5702                         up_write(&EXT4_I(inode)->i_data_sem);
5703                         goto out_stop;
5704                 }
5705         } else {
5706                 ext4_ext_drop_refs(path);
5707                 kfree(path);
5708         }
5709
5710         ret = ext4_es_remove_extent(inode, offset_lblk,
5711                         EXT_MAX_BLOCKS - offset_lblk);
5712         if (ret) {
5713                 up_write(&EXT4_I(inode)->i_data_sem);
5714                 goto out_stop;
5715         }
5716
5717         /*
5718          * If offset_lblk lies in a hole at the start of the file, use
5719          * ee_start_lblk instead to shift the extents
5720          */
5721         ret = ext4_ext_shift_extents(inode, handle,
5722                 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5723                 len_lblk, SHIFT_RIGHT);
5724
5725         up_write(&EXT4_I(inode)->i_data_sem);
5726         if (IS_SYNC(inode))
5727                 ext4_handle_sync(handle);
5728         if (ret >= 0)
5729                 ext4_update_inode_fsync_trans(handle, inode, 1);
5730
5731 out_stop:
5732         ext4_journal_stop(handle);
5733 out_mmap:
5734         up_write(&EXT4_I(inode)->i_mmap_sem);
5735 out_mutex:
5736         inode_unlock(inode);
5737         return ret;
5738 }
5739
5740 /**
5741  * ext4_swap_extents() - Swap extents between two inodes
5742  * @handle: handle for this transaction
5743  * @inode1:     First inode
5744  * @inode2:     Second inode
5745  * @lblk1:      Start block for first inode
5746  * @lblk2:      Start block for second inode
5747  * @count:      Number of blocks to swap
5748  * @unwritten: Mark second inode's extents as unwritten after swap
5749  * @erp:        Pointer to save error value
5750  *
5751  * This helper routine does exactly what its name promises: it swaps extents.
5752  * Everything else, such as page-cache locking consistency, bh mapping
5753  * consistency or copying of the extents' data, must be handled by the caller.
5754  * Locking:
5755  *              i_mutex is held for both inodes
5756  *              i_data_sem is locked for write for both inodes
5757  * Assumptions:
5758  *              All pages from requested range are locked for both inodes
5759  *              All pages from requested range are locked for both inodes
 */
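/*
 * Note: the expected caller is the online defragmentation path
 * (EXT4_IOC_MOVE_EXT via ext4_move_extents() in move_extent.c), which is
 * responsible for the locking and page-cache requirements listed above.
 */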
5760 int
5761 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5762                   struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5763                   ext4_lblk_t count, int unwritten, int *erp)
5764 {
5765         struct ext4_ext_path *path1 = NULL;
5766         struct ext4_ext_path *path2 = NULL;
5767         int replaced_count = 0;
5768
5769         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5770         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5771         BUG_ON(!inode_is_locked(inode1));
5772         BUG_ON(!inode_is_locked(inode2));
5773
5774         *erp = ext4_es_remove_extent(inode1, lblk1, count);
5775         if (unlikely(*erp))
5776                 return 0;
5777         *erp = ext4_es_remove_extent(inode2, lblk2, count);
5778         if (unlikely(*erp))
5779                 return 0;
5780
5781         while (count) {
5782                 struct ext4_extent *ex1, *ex2, tmp_ex;
5783                 ext4_lblk_t e1_blk, e2_blk;
5784                 int e1_len, e2_len, len;
5785                 int split = 0;
5786
5787                 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5788                 if (IS_ERR(path1)) {
5789                         *erp = PTR_ERR(path1);
5790                         path1 = NULL;
5791                 finish:
5792                         count = 0;
5793                         goto repeat;
5794                 }
5795                 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5796                 if (IS_ERR(path2)) {
5797                         *erp = PTR_ERR(path2);
5798                         path2 = NULL;
5799                         goto finish;
5800                 }
5801                 ex1 = path1[path1->p_depth].p_ext;
5802                 ex2 = path2[path2->p_depth].p_ext;
5803                 /* Do we have something to swap? */
5804                 if (unlikely(!ex2 || !ex1))
5805                         goto finish;
5806
5807                 e1_blk = le32_to_cpu(ex1->ee_block);
5808                 e2_blk = le32_to_cpu(ex2->ee_block);
5809                 e1_len = ext4_ext_get_actual_len(ex1);
5810                 e2_len = ext4_ext_get_actual_len(ex2);
5811
5812                 /* Hole handling */
5813                 if (!in_range(lblk1, e1_blk, e1_len) ||
5814                     !in_range(lblk2, e2_blk, e2_len)) {
5815                         ext4_lblk_t next1, next2;
5816
5817                         /* if hole after extent, then go to next extent */
5818                         next1 = ext4_ext_next_allocated_block(path1);
5819                         next2 = ext4_ext_next_allocated_block(path2);
5820                         /* If hole before extent, then shift to that extent */
5821                         if (e1_blk > lblk1)
5822                                 next1 = e1_blk;
5823                         if (e2_blk > lblk2)
5824                                 next2 = e2_blk;
5825                         /* Do we have something to swap? */
5826                         if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5827                                 goto finish;
5828                         /* Move to the rightmost boundary */
5829                         len = next1 - lblk1;
5830                         if (len < next2 - lblk2)
5831                                 len = next2 - lblk2;
5832                         if (len > count)
5833                                 len = count;
5834                         lblk1 += len;
5835                         lblk2 += len;
5836                         count -= len;
5837                         goto repeat;
5838                 }
5839
5840                 /* Prepare left boundary */
5841                 if (e1_blk < lblk1) {
5842                         split = 1;
5843                         *erp = ext4_force_split_extent_at(handle, inode1,
5844                                                 &path1, lblk1, 0);
5845                         if (unlikely(*erp))
5846                                 goto finish;
5847                 }
5848                 if (e2_blk < lblk2) {
5849                         split = 1;
5850                         *erp = ext4_force_split_extent_at(handle, inode2,
5851                                                 &path2,  lblk2, 0);
5852                         if (unlikely(*erp))
5853                                 goto finish;
5854                 }
5855                 /* ext4_split_extent_at() may result in a leaf extent split;
5856                  * the path must be revalidated. */
5857                 if (split)
5858                         goto repeat;
5859
5860                 /* Prepare right boundary */
5861                 len = count;
5862                 if (len > e1_blk + e1_len - lblk1)
5863                         len = e1_blk + e1_len - lblk1;
5864                 if (len > e2_blk + e2_len - lblk2)
5865                         len = e2_blk + e2_len - lblk2;
5866
5867                 if (len != e1_len) {
5868                         split = 1;
5869                         *erp = ext4_force_split_extent_at(handle, inode1,
5870                                                 &path1, lblk1 + len, 0);
5871                         if (unlikely(*erp))
5872                                 goto finish;
5873                 }
5874                 if (len != e2_len) {
5875                         split = 1;
5876                         *erp = ext4_force_split_extent_at(handle, inode2,
5877                                                 &path2, lblk2 + len, 0);
5878                         if (*erp)
5879                                 goto finish;
5880                 }
5881                 /* ext4_split_extent_at() may result in a leaf extent split;
5882                  * the path must be revalidated. */
5883                 if (split)
5884                         goto repeat;
5885
5886                 BUG_ON(e2_len != e1_len);
5887                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5888                 if (unlikely(*erp))
5889                         goto finish;
5890                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5891                 if (unlikely(*erp))
5892                         goto finish;
5893
5894                 /* Both extents are fully inside boundaries. Swap them now */
5895                 tmp_ex = *ex1;
5896                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5897                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5898                 ex1->ee_len = cpu_to_le16(e2_len);
5899                 ex2->ee_len = cpu_to_le16(e1_len);
5900                 if (unwritten)
5901                         ext4_ext_mark_unwritten(ex2);
5902                 if (ext4_ext_is_unwritten(&tmp_ex))
5903                         ext4_ext_mark_unwritten(ex1);
5904
5905                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5906                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5907                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5908                                       path2->p_depth);
5909                 if (unlikely(*erp))
5910                         goto finish;
5911                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5912                                       path1->p_depth);
5913                 /*
5914                  * Looks scary, eh? The second inode already points to the new
5915                  * blocks and was successfully dirtied. Luckily, an error here
5916                  * can only be a journal error, in which case the whole
5917                  * transaction will be aborted anyway.
5918                  */
5919                 if (unlikely(*erp))
5920                         goto finish;
5921                 lblk1 += len;
5922                 lblk2 += len;
5923                 replaced_count += len;
5924                 count -= len;
5925
5926         repeat:
5927                 ext4_ext_drop_refs(path1);
5928                 kfree(path1);
5929                 ext4_ext_drop_refs(path2);
5930                 kfree(path2);
5931                 path1 = path2 = NULL;
5932         }
5933         return replaced_count;
5934 }
5935
5936 /*
5937  * ext4_clu_mapped - determine whether any block in a logical cluster has
5938  *                   been mapped to a physical cluster
5939  *
5940  * @inode - file containing the logical cluster
5941  * @lclu - logical cluster of interest
5942  *
5943  * Returns 1 if any block in the logical cluster is mapped, signifying
5944  * that a physical cluster has been allocated for it.  Otherwise,
5945  * returns 0.  Can also return negative error codes.  Derived from
5946  * ext4_ext_map_blocks().
5947  */
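/*
 * For illustration (hypothetical geometry): on a bigalloc filesystem with
 * 16 blocks per cluster, lclu 3 covers logical blocks 48..63
 * (EXT4_C2B(sbi, 3) == 48), and this returns 1 if any extent found below
 * covers any block in that range.
 */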
5948 int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5949 {
5950         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5951         struct ext4_ext_path *path;
5952         int depth, mapped = 0, err = 0;
5953         struct ext4_extent *extent;
5954         ext4_lblk_t first_lblk, first_lclu, last_lclu;
5955
5956         /* search for the extent closest to the first block in the cluster */
5957         path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5958         if (IS_ERR(path)) {
5959                 err = PTR_ERR(path);
5960                 path = NULL;
5961                 goto out;
5962         }
5963
5964         depth = ext_depth(inode);
5965
5966         /*
5967          * A consistent leaf must not be empty.  This situation is possible,
5968          * though, _during_ tree modification, and it's why an assert can't
5969          * be put in ext4_find_extent().
5970          */
5971         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5972                 EXT4_ERROR_INODE(inode,
5973                     "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5974                                  (unsigned long) EXT4_C2B(sbi, lclu),
5975                                  depth, path[depth].p_block);
5976                 err = -EFSCORRUPTED;
5977                 goto out;
5978         }
5979
5980         extent = path[depth].p_ext;
5981
5982         /* can't be mapped if the extent tree is empty */
5983         if (extent == NULL)
5984                 goto out;
5985
5986         first_lblk = le32_to_cpu(extent->ee_block);
5987         first_lclu = EXT4_B2C(sbi, first_lblk);
5988
5989         /*
5990          * Three possible outcomes at this point - found extent spanning
5991          * the target cluster, to the left of the target cluster, or to the
5992          * right of the target cluster.  The first two cases are handled here.
5993          * The last case indicates the target cluster is not mapped.
5994          */
5995         if (lclu >= first_lclu) {
5996                 last_lclu = EXT4_B2C(sbi, first_lblk +
5997                                      ext4_ext_get_actual_len(extent) - 1);
5998                 if (lclu <= last_lclu) {
5999                         mapped = 1;
6000                 } else {
6001                         first_lblk = ext4_ext_next_allocated_block(path);
6002                         first_lclu = EXT4_B2C(sbi, first_lblk);
6003                         if (lclu == first_lclu)
6004                                 mapped = 1;
6005                 }
6006         }
6007
6008 out:
6009         ext4_ext_drop_refs(path);
6010         kfree(path);
6011
6012         return err ? err : mapped;
6013 }