Merge tag 'dma-mapping-5.12' of git://git.infradead.org/users/hch/dma-mapping
[linux-2.6-microblaze.git] / fs / ocfs2 / refcounttree.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* -*- mode: c; c-basic-offset: 8; -*-
3  * vim: noexpandtab sw=8 ts=8 sts=0:
4  *
5  * refcounttree.c
6  *
7  * Copyright (C) 2009 Oracle.  All rights reserved.
8  */
9
10 #include <linux/sort.h>
11 #include <cluster/masklog.h>
12 #include "ocfs2.h"
13 #include "inode.h"
14 #include "alloc.h"
15 #include "suballoc.h"
16 #include "journal.h"
17 #include "uptodate.h"
18 #include "super.h"
19 #include "buffer_head_io.h"
20 #include "blockcheck.h"
21 #include "refcounttree.h"
22 #include "sysfile.h"
23 #include "dlmglue.h"
24 #include "extent_map.h"
25 #include "aops.h"
26 #include "xattr.h"
27 #include "namei.h"
28 #include "ocfs2_trace.h"
29 #include "file.h"
30
31 #include <linux/bio.h>
32 #include <linux/blkdev.h>
33 #include <linux/slab.h>
34 #include <linux/writeback.h>
35 #include <linux/pagevec.h>
36 #include <linux/swap.h>
37 #include <linux/security.h>
38 #include <linux/fsnotify.h>
39 #include <linux/quotaops.h>
40 #include <linux/namei.h>
41 #include <linux/mount.h>
42 #include <linux/posix_acl.h>
43
/*
 * Bundle of state handed between the "cow" helpers.
 * NOTE(review): every user of this structure lies outside this part of the
 * file, so the notes below are read off the field names and types only --
 * confirm them against the call sites.
 */
44 struct ocfs2_cow_context {
45         struct inode *inode;
           /* presumably the start and length, in clusters, of the range -- TODO confirm */
46         u32 cow_start;
47         u32 cow_len;
48         struct ocfs2_extent_tree data_et;
           /* refcount tree and its root block buffer used by the operation */
49         struct ocfs2_refcount_tree *ref_tree;
50         struct buffer_head *ref_root_bh;
           /* metadata / data allocation reservations */
51         struct ocfs2_alloc_context *meta_ac;
52         struct ocfs2_alloc_context *data_ac;
53         struct ocfs2_cached_dealloc_ctxt dealloc;
           /* opaque pointer; its meaning is defined by the code that sets it */
54         void *cow_object;
55         struct ocfs2_post_refcount *post_refcount;
56         int extra_credits;
           /*
            * Callback taking a virtual cluster and filling in a physical
            * cluster, a cluster count and extent flags; returns an int status.
            */
57         int (*get_clusters)(struct ocfs2_cow_context *context,
58                             u32 v_cluster, u32 *p_cluster,
59                             u32 *num_clusters,
60                             unsigned int *extent_flags);
           /*
            * Callback invoked with a journal handle and an old/new cluster
            * pair; presumably copies the data -- verify at the implementations.
            */
61         int (*cow_duplicate_clusters)(handle_t *handle,
62                                       struct inode *inode,
63                                       u32 cpos, u32 old_cluster,
64                                       u32 new_cluster, u32 new_len);
65 };
66
67 static inline struct ocfs2_refcount_tree *
68 cache_info_to_refcount(struct ocfs2_caching_info *ci)
69 {
70         return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
71 }
72
73 static int ocfs2_validate_refcount_block(struct super_block *sb,
74                                          struct buffer_head *bh)
75 {
76         int rc;
77         struct ocfs2_refcount_block *rb =
78                 (struct ocfs2_refcount_block *)bh->b_data;
79
80         trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
81
82         BUG_ON(!buffer_uptodate(bh));
83
84         /*
85          * If the ecc fails, we return the error but otherwise
86          * leave the filesystem running.  We know any error is
87          * local to this block.
88          */
89         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
90         if (rc) {
91                 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
92                      (unsigned long long)bh->b_blocknr);
93                 return rc;
94         }
95
96
97         if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
98                 rc = ocfs2_error(sb,
99                                  "Refcount block #%llu has bad signature %.*s\n",
100                                  (unsigned long long)bh->b_blocknr, 7,
101                                  rb->rf_signature);
102                 goto out;
103         }
104
105         if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
106                 rc = ocfs2_error(sb,
107                                  "Refcount block #%llu has an invalid rf_blkno of %llu\n",
108                                  (unsigned long long)bh->b_blocknr,
109                                  (unsigned long long)le64_to_cpu(rb->rf_blkno));
110                 goto out;
111         }
112
113         if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
114                 rc = ocfs2_error(sb,
115                                  "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
116                                  (unsigned long long)bh->b_blocknr,
117                                  le32_to_cpu(rb->rf_fs_generation));
118                 goto out;
119         }
120 out:
121         return rc;
122 }
123
124 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
125                                      u64 rb_blkno,
126                                      struct buffer_head **bh)
127 {
128         int rc;
129         struct buffer_head *tmp = *bh;
130
131         rc = ocfs2_read_block(ci, rb_blkno, &tmp,
132                               ocfs2_validate_refcount_block);
133
134         /* If ocfs2_read_block() got us a new bh, pass it up. */
135         if (!rc && !*bh)
136                 *bh = tmp;
137
138         return rc;
139 }
140
141 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
142 {
143         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
144
145         return rf->rf_blkno;
146 }
147
148 static struct super_block *
149 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
150 {
151         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
152
153         return rf->rf_sb;
154 }
155
156 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
157 __acquires(&rf->rf_lock)
158 {
159         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
160
161         spin_lock(&rf->rf_lock);
162 }
163
164 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
165 __releases(&rf->rf_lock)
166 {
167         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
168
169         spin_unlock(&rf->rf_lock);
170 }
171
172 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
173 {
174         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
175
176         mutex_lock(&rf->rf_io_mutex);
177 }
178
179 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
180 {
181         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
182
183         mutex_unlock(&rf->rf_io_mutex);
184 }
185
/*
 * Metadata-cache callbacks for a refcount tree.  Installed on a tree's
 * rf_ci by ocfs2_init_refcount_tree_ci(); each callback recovers the tree
 * from the caching info with cache_info_to_refcount().
 */
186 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
187         .co_owner               = ocfs2_refcount_cache_owner,
188         .co_get_super           = ocfs2_refcount_cache_get_super,
            /* rf_lock spinlock */
189         .co_cache_lock          = ocfs2_refcount_cache_lock,
190         .co_cache_unlock        = ocfs2_refcount_cache_unlock,
            /* rf_io_mutex */
191         .co_io_lock             = ocfs2_refcount_cache_io_lock,
192         .co_io_unlock           = ocfs2_refcount_cache_io_unlock,
193 };
194
195 static struct ocfs2_refcount_tree *
196 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
197 {
198         struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
199         struct ocfs2_refcount_tree *tree = NULL;
200
201         while (n) {
202                 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
203
204                 if (blkno < tree->rf_blkno)
205                         n = n->rb_left;
206                 else if (blkno > tree->rf_blkno)
207                         n = n->rb_right;
208                 else
209                         return tree;
210         }
211
212         return NULL;
213 }
214
215 /* osb_lock is already locked. */
216 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
217                                        struct ocfs2_refcount_tree *new)
218 {
219         u64 rf_blkno = new->rf_blkno;
220         struct rb_node *parent = NULL;
221         struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
222         struct ocfs2_refcount_tree *tmp;
223
224         while (*p) {
225                 parent = *p;
226
227                 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
228                                rf_node);
229
230                 if (rf_blkno < tmp->rf_blkno)
231                         p = &(*p)->rb_left;
232                 else if (rf_blkno > tmp->rf_blkno)
233                         p = &(*p)->rb_right;
234                 else {
235                         /* This should never happen! */
236                         mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
237                              (unsigned long long)rf_blkno);
238                         BUG();
239                 }
240         }
241
242         rb_link_node(&new->rf_node, parent, p);
243         rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
244 }
245
/*
 * Tear down and free a refcount tree: exit its metadata cache, drop and
 * free its lock resource, then kfree() the structure.
 *
 * The tree is not unlinked from osb_rf_lock_tree here.  The callers visible
 * in this file either erase it from the rb-tree first or never inserted it.
 */
246 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
247 {
248         ocfs2_metadata_cache_exit(&tree->rf_ci);
249         ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
250         ocfs2_lock_res_free(&tree->rf_lockres);
251         kfree(tree);
252 }
253
254 static inline void
255 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
256                                         struct ocfs2_refcount_tree *tree)
257 {
258         rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
259         if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
260                 osb->osb_ref_tree_lru = NULL;
261 }
262
/*
 * Locked wrapper around ocfs2_erase_refcount_tree_from_list_no_lock():
 * takes osb_lock, unlinks @tree from the rb-tree (and from the
 * osb_ref_tree_lru cache if it is cached there), then drops the lock.
 * The tree's reference count is not touched.
 */
263 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
264                                         struct ocfs2_refcount_tree *tree)
265 {
266         spin_lock(&osb->osb_lock);
267         ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
268         spin_unlock(&osb->osb_lock);
269 }
270
271 static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
272 {
273         struct ocfs2_refcount_tree *tree =
274                 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
275
276         ocfs2_free_refcount_tree(tree);
277 }
278
/* Take one reference on @tree (rf_getcnt). */
279 static inline void
280 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
281 {
282         kref_get(&tree->rf_getcnt);
283 }
284
/*
 * Drop one reference on @tree.  When the count reaches zero the tree is
 * freed through ocfs2_kref_remove_refcount_tree().
 */
285 static inline void
286 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
287 {
288         kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
289 }
290
291 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
292                                                struct super_block *sb)
293 {
294         ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
295         mutex_init(&new->rf_io_mutex);
296         new->rf_sb = sb;
297         spin_lock_init(&new->rf_lock);
298 }
299
/*
 * Initialise the locking side of a new tree: the rf_sem rw-semaphore and
 * the rf_lockres lock resource, which is set up from the tree's block
 * number and generation.  Callers must therefore know the generation
 * before calling this.
 */
300 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
301                                         struct ocfs2_refcount_tree *new,
302                                         u64 rf_blkno, u32 generation)
303 {
304         init_rwsem(&new->rf_sem);
305         ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
306                                      rf_blkno, generation);
307 }
308
309 static struct ocfs2_refcount_tree*
310 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
311 {
312         struct ocfs2_refcount_tree *new;
313
314         new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
315         if (!new)
316                 return NULL;
317
318         new->rf_blkno = rf_blkno;
319         kref_init(&new->rf_getcnt);
320         ocfs2_init_refcount_tree_ci(new, osb->sb);
321
322         return new;
323 }
324
/*
 * Find, or create and register, the in-memory refcount tree for root block
 * @rf_blkno and return it through @ret_tree.
 *
 * The lookup tries the one-entry osb_ref_tree_lru cache first and then the
 * rb-tree, both under osb_lock.  On a miss the lock is dropped while a new
 * tree is allocated and its root block is read to learn the generation;
 * the rb-tree is then searched again under the lock, because another
 * thread may have inserted the same tree in the meantime.
 *
 * This function does not take a reference on the returned tree.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, or the error from
 * reading the root block.  *@ret_tree is only written on success.
 */
325 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
326                                    struct ocfs2_refcount_tree **ret_tree)
327 {
328         int ret = 0;
329         struct ocfs2_refcount_tree *tree, *new = NULL;
330         struct buffer_head *ref_root_bh = NULL;
331         struct ocfs2_refcount_block *ref_rb;
332
333         spin_lock(&osb->osb_lock);
334         if (osb->osb_ref_tree_lru &&
335             osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
336                 tree = osb->osb_ref_tree_lru;
337         else
338                 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
            /* Hit: jump to "out" with osb_lock still held. */
339         if (tree)
340                 goto out;
341
342         spin_unlock(&osb->osb_lock);
343
344         new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
345         if (!new) {
346                 ret = -ENOMEM;
347                 mlog_errno(ret);
348                 return ret;
349         }
350         /*
351          * We need the generation to create the refcount tree lock and since
352          * it isn't changed during the tree modification, we are safe here to
353          * read without protection.
354          * We also have to purge the cache after we create the lock since the
355          * refcount block may have the stale data. It can only be trusted when
356          * we hold the refcount lock.
357          */
358         ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
359         if (ret) {
360                 mlog_errno(ret);
                    /*
                     * The lock resource has not been initialised yet, so
                     * undo only the cache setup instead of calling
                     * ocfs2_free_refcount_tree().
                     */
361                 ocfs2_metadata_cache_exit(&new->rf_ci);
362                 kfree(new);
363                 return ret;
364         }
365
366         ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
367         new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
368         ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
369                                       new->rf_generation);
370         ocfs2_metadata_cache_purge(&new->rf_ci);
371
372         spin_lock(&osb->osb_lock);
            /* Re-check: the tree may have been inserted while unlocked. */
373         tree = ocfs2_find_refcount_tree(osb, rf_blkno);
374         if (tree)
375                 goto out;
376
377         ocfs2_insert_refcount_tree(osb, new);
378
            /* "new" is now owned by the rb-tree; do not free it below. */
379         tree = new;
380         new = NULL;
381
382 out:
            /* Reached with osb_lock held on every path. */
383         *ret_tree = tree;
384
385         osb->osb_ref_tree_lru = tree;
386
387         spin_unlock(&osb->osb_lock);
388
            /* Lost the insertion race: discard our fully initialised copy. */
389         if (new)
390                 ocfs2_free_refcount_tree(new);
391
392         brelse(ref_root_bh);
393         return ret;
394 }
395
396 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
397 {
398         int ret;
399         struct buffer_head *di_bh = NULL;
400         struct ocfs2_dinode *di;
401
402         ret = ocfs2_read_inode_block(inode, &di_bh);
403         if (ret) {
404                 mlog_errno(ret);
405                 goto out;
406         }
407
408         BUG_ON(!ocfs2_is_refcount_inode(inode));
409
410         di = (struct ocfs2_dinode *)di_bh->b_data;
411         *ref_blkno = le64_to_cpu(di->i_refcount_loc);
412         brelse(di_bh);
413 out:
414         return ret;
415 }
416
417 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
418                                       struct ocfs2_refcount_tree *tree, int rw)
419 {
420         int ret;
421
422         ret = ocfs2_refcount_lock(tree, rw);
423         if (ret) {
424                 mlog_errno(ret);
425                 goto out;
426         }
427
428         if (rw)
429                 down_write(&tree->rf_sem);
430         else
431                 down_read(&tree->rf_sem);
432
433 out:
434         return ret;
435 }
436
437 /*
438  * Lock the refcount tree pointed to by ref_blkno and return the tree.
439  * In most cases, we lock the tree and then read the refcount block,
440  * so read it here if the caller really needs it.
441  *
442  * If the tree has been re-created by another node, it will free the
443  * old one and re-create it.
     *
     * @rw:       non-zero to lock for writing, zero for reading.
     * @ret_tree: receives the locked tree on success.
     * @ref_bh:   optional; when non-NULL it receives the root block buffer,
     *            which the caller must then brelse().
     *
     * On success the tree's locks and one reference are held; release them
     * all with ocfs2_unlock_refcount_tree() using the same @rw.
     * Returns 0 or a negative error, in which case nothing is held.
444  */
445 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
446                              u64 ref_blkno, int rw,
447                              struct ocfs2_refcount_tree **ret_tree,
448                              struct buffer_head **ref_bh)
449 {
450         int ret, delete_tree = 0;
451         struct ocfs2_refcount_tree *tree = NULL;
452         struct buffer_head *ref_root_bh = NULL;
453         struct ocfs2_refcount_block *rb;
454
455 again:
456         ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
457         if (ret) {
458                 mlog_errno(ret);
459                 return ret;
460         }
461
            /* Reference for this lock holder; dropped by the unlock. */
462         ocfs2_refcount_tree_get(tree);
463
464         ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
465         if (ret) {
466                 mlog_errno(ret);
                    /* Nothing is locked, so only drop our reference. */
467                 ocfs2_refcount_tree_put(tree);
468                 goto out;
469         }
470
471         ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
472                                         &ref_root_bh);
473         if (ret) {
474                 mlog_errno(ret);
                    /* The unlock also drops the reference taken above. */
475                 ocfs2_unlock_refcount_tree(osb, tree, rw);
476                 goto out;
477         }
478
479         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
480         /*
481          * If the refcount block has been freed and re-created, we may need
482          * to recreate the refcount tree also.
483          *
484          * Here we just remove the tree from the rb-tree, and the last
485          * kref holder will unlock and delete this refcount_tree.
486          * Then we goto "again" and ocfs2_get_refcount_tree will create
487          * the new refcount tree for us.
488          */
489         if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
                    /* rf_removed ensures only one holder unlinks the tree. */
490                 if (!tree->rf_removed) {
491                         ocfs2_erase_refcount_tree_from_list(osb, tree);
492                         tree->rf_removed = 1;
493                         delete_tree = 1;
494                 }
495
496                 ocfs2_unlock_refcount_tree(osb, tree, rw);
497                 /*
498                  * We get an extra reference when we create the refcount
499                  * tree, so another put will destroy it.
500                  */
501                 if (delete_tree)
502                         ocfs2_refcount_tree_put(tree);
503                 brelse(ref_root_bh);
504                 ref_root_bh = NULL;
505                 goto again;
506         }
507
508         *ret_tree = tree;
            /* Pass the buffer to the caller, or release it at "out". */
509         if (ref_bh) {
510                 *ref_bh = ref_root_bh;
511                 ref_root_bh = NULL;
512         }
513 out:
514         brelse(ref_root_bh);
515         return ret;
516 }
517
/*
 * Undo ocfs2_lock_refcount_tree(): release rf_sem, then the lock taken by
 * ocfs2_refcount_lock(), then drop the reference that was taken when the
 * tree was locked.  @rw must match the value used to lock.
 *
 * Because the final put may free the tree, callers must not touch @tree
 * afterwards unless they hold another reference.
 */
518 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
519                                 struct ocfs2_refcount_tree *tree, int rw)
520 {
521         if (rw)
522                 up_write(&tree->rf_sem);
523         else
524                 up_read(&tree->rf_sem);
525
526         ocfs2_refcount_unlock(tree, rw);
527         ocfs2_refcount_tree_put(tree);
528 }
529
530 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
531 {
532         struct rb_node *node;
533         struct ocfs2_refcount_tree *tree;
534         struct rb_root *root = &osb->osb_rf_lock_tree;
535
536         while ((node = rb_last(root)) != NULL) {
537                 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
538
539                 trace_ocfs2_purge_refcount_trees(
540                                 (unsigned long long) tree->rf_blkno);
541
542                 rb_erase(&tree->rf_node, root);
543                 ocfs2_free_refcount_tree(tree);
544         }
545 }
546
547 /*
548  * Create a refcount tree for an inode.
549  * We take for granted that the inode is already locked.
550  */
551 static int ocfs2_create_refcount_tree(struct inode *inode,
552                                       struct buffer_head *di_bh)
553 {
554         int ret;
555         handle_t *handle = NULL;
556         struct ocfs2_alloc_context *meta_ac = NULL;
557         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
558         struct ocfs2_inode_info *oi = OCFS2_I(inode);
559         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
560         struct buffer_head *new_bh = NULL;
561         struct ocfs2_refcount_block *rb;
562         struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
563         u16 suballoc_bit_start;
564         u32 num_got;
565         u64 suballoc_loc, first_blkno;
566
567         BUG_ON(ocfs2_is_refcount_inode(inode));
568
569         trace_ocfs2_create_refcount_tree(
570                 (unsigned long long)oi->ip_blkno);
571
572         ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
573         if (ret) {
574                 mlog_errno(ret);
575                 goto out;
576         }
577
578         handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
579         if (IS_ERR(handle)) {
580                 ret = PTR_ERR(handle);
581                 mlog_errno(ret);
582                 goto out;
583         }
584
585         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
586                                       OCFS2_JOURNAL_ACCESS_WRITE);
587         if (ret) {
588                 mlog_errno(ret);
589                 goto out_commit;
590         }
591
592         ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
593                                    &suballoc_bit_start, &num_got,
594                                    &first_blkno);
595         if (ret) {
596                 mlog_errno(ret);
597                 goto out_commit;
598         }
599
600         new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
601         if (!new_tree) {
602                 ret = -ENOMEM;
603                 mlog_errno(ret);
604                 goto out_commit;
605         }
606
607         new_bh = sb_getblk(inode->i_sb, first_blkno);
608         if (!new_bh) {
609                 ret = -ENOMEM;
610                 mlog_errno(ret);
611                 goto out_commit;
612         }
613         ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
614
615         ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
616                                       OCFS2_JOURNAL_ACCESS_CREATE);
617         if (ret) {
618                 mlog_errno(ret);
619                 goto out_commit;
620         }
621
622         /* Initialize ocfs2_refcount_block. */
623         rb = (struct ocfs2_refcount_block *)new_bh->b_data;
624         memset(rb, 0, inode->i_sb->s_blocksize);
625         strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
626         rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
627         rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
628         rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
629         rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
630         rb->rf_blkno = cpu_to_le64(first_blkno);
631         rb->rf_count = cpu_to_le32(1);
632         rb->rf_records.rl_count =
633                         cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
634         spin_lock(&osb->osb_lock);
635         rb->rf_generation = osb->s_next_generation++;
636         spin_unlock(&osb->osb_lock);
637
638         ocfs2_journal_dirty(handle, new_bh);
639
640         spin_lock(&oi->ip_lock);
641         oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
642         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
643         di->i_refcount_loc = cpu_to_le64(first_blkno);
644         spin_unlock(&oi->ip_lock);
645
646         trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
647
648         ocfs2_journal_dirty(handle, di_bh);
649
650         /*
651          * We have to init the tree lock here since it will use
652          * the generation number to create it.
653          */
654         new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
655         ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
656                                       new_tree->rf_generation);
657
658         spin_lock(&osb->osb_lock);
659         tree = ocfs2_find_refcount_tree(osb, first_blkno);
660
661         /*
662          * We've just created a new refcount tree in this block.  If
663          * we found a refcount tree on the ocfs2_super, it must be
664          * one we just deleted.  We free the old tree before
665          * inserting the new tree.
666          */
667         BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
668         if (tree)
669                 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
670         ocfs2_insert_refcount_tree(osb, new_tree);
671         spin_unlock(&osb->osb_lock);
672         new_tree = NULL;
673         if (tree)
674                 ocfs2_refcount_tree_put(tree);
675
676 out_commit:
677         ocfs2_commit_trans(osb, handle);
678
679 out:
680         if (new_tree) {
681                 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
682                 kfree(new_tree);
683         }
684
685         brelse(new_bh);
686         if (meta_ac)
687                 ocfs2_free_alloc_context(meta_ac);
688
689         return ret;
690 }
691
/*
 * Attach @inode to the existing refcount tree rooted at @refcount_loc.
 *
 * With the tree write-locked and inside one transaction, this bumps the
 * root block's rf_count by one and marks the inode as refcounted
 * (OCFS2_HAS_REFCOUNT_FL, i_refcount_loc = @refcount_loc).
 *
 * @inode must not already be a refcount inode (BUG_ON otherwise).
 * @di_bh holds @inode's on-disk inode block.
 * Returns 0 on success or a negative error code.
 */
692 static int ocfs2_set_refcount_tree(struct inode *inode,
693                                    struct buffer_head *di_bh,
694                                    u64 refcount_loc)
695 {
696         int ret;
697         handle_t *handle = NULL;
698         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
699         struct ocfs2_inode_info *oi = OCFS2_I(inode);
700         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
701         struct buffer_head *ref_root_bh = NULL;
702         struct ocfs2_refcount_block *rb;
703         struct ocfs2_refcount_tree *ref_tree;
704
705         BUG_ON(ocfs2_is_refcount_inode(inode));
706
            /* Write-lock the tree and fetch its root block in one step. */
707         ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
708                                        &ref_tree, &ref_root_bh);
709         if (ret) {
710                 mlog_errno(ret);
711                 return ret;
712         }
713
714         handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
715         if (IS_ERR(handle)) {
716                 ret = PTR_ERR(handle);
717                 mlog_errno(ret);
718                 goto out;
719         }
720
721         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
722                                       OCFS2_JOURNAL_ACCESS_WRITE);
723         if (ret) {
724                 mlog_errno(ret);
725                 goto out_commit;
726         }
727
728         ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
729                                       OCFS2_JOURNAL_ACCESS_WRITE);
730         if (ret) {
731                 mlog_errno(ret);
732                 goto out_commit;
733         }
734
            /* One more inode now shares this tree. */
735         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
736         le32_add_cpu(&rb->rf_count, 1);
737
738         ocfs2_journal_dirty(handle, ref_root_bh);
739
740         spin_lock(&oi->ip_lock);
741         oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
742         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
743         di->i_refcount_loc = cpu_to_le64(refcount_loc);
744         spin_unlock(&oi->ip_lock);
745         ocfs2_journal_dirty(handle, di_bh);
746
747 out_commit:
748         ocfs2_commit_trans(osb, handle);
749 out:
750         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
751         brelse(ref_root_bh);
752
753         return ret;
754 }
755
/*
 * Detach @inode from its refcount tree.
 *
 * Returns 0 immediately if @inode is not a refcount inode.  Otherwise, with
 * the tree write-locked and inside one transaction, this clears
 * OCFS2_HAS_REFCOUNT_FL and i_refcount_loc on the inode and decrements the
 * root block's rf_count.  If this inode was the last user, the root block
 * is also returned to its suballocator and the in-memory tree is unlinked
 * and released.
 *
 * @di_bh holds @inode's on-disk inode block.
 * Returns 0 on success or a negative error code.
 */
756 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
757 {
758         int ret, delete_tree = 0;
759         handle_t *handle = NULL;
760         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
761         struct ocfs2_inode_info *oi = OCFS2_I(inode);
762         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
763         struct ocfs2_refcount_block *rb;
764         struct inode *alloc_inode = NULL;
765         struct buffer_head *alloc_bh = NULL;
766         struct buffer_head *blk_bh = NULL;
767         struct ocfs2_refcount_tree *ref_tree;
768         int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
769         u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
770         u16 bit = 0;
771
772         if (!ocfs2_is_refcount_inode(inode))
773                 return 0;
774
775         BUG_ON(!ref_blkno);
776         ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
777         if (ret) {
778                 mlog_errno(ret);
779                 return ret;
780         }
781
782         rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
783
784         /*
785          * If we are the last user, we need to free the block.
786          * So lock the allocator ahead.
787          */
788         if (le32_to_cpu(rb->rf_count) == 1) {
789                 blk = le64_to_cpu(rb->rf_blkno);
790                 bit = le16_to_cpu(rb->rf_suballoc_bit);
                    /* Use the recorded group location when there is one. */
791                 if (rb->rf_suballoc_loc)
792                         bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
793                 else
794                         bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795
796                 alloc_inode = ocfs2_get_system_file_inode(osb,
797                                         EXTENT_ALLOC_SYSTEM_INODE,
798                                         le16_to_cpu(rb->rf_suballoc_slot));
799                 if (!alloc_inode) {
800                         ret = -ENOMEM;
801                         mlog_errno(ret);
802                         goto out;
803                 }
804                 inode_lock(alloc_inode);
805
806                 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
807                 if (ret) {
808                         mlog_errno(ret);
809                         goto out_mutex;
810                 }
811
                    /* Extra journal credits for freeing the block. */
812                 credits += OCFS2_SUBALLOC_FREE;
813         }
814
815         handle = ocfs2_start_trans(osb, credits);
816         if (IS_ERR(handle)) {
817                 ret = PTR_ERR(handle);
818                 mlog_errno(ret);
819                 goto out_unlock;
820         }
821
822         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
823                                       OCFS2_JOURNAL_ACCESS_WRITE);
824         if (ret) {
825                 mlog_errno(ret);
826                 goto out_commit;
827         }
828
829         ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
830                                       OCFS2_JOURNAL_ACCESS_WRITE);
831         if (ret) {
832                 mlog_errno(ret);
833                 goto out_commit;
834         }
835
836         spin_lock(&oi->ip_lock);
837         oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
838         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
839         di->i_refcount_loc = 0;
840         spin_unlock(&oi->ip_lock);
841         ocfs2_journal_dirty(handle, di_bh);
842
843         le32_add_cpu(&rb->rf_count , -1);
844         ocfs2_journal_dirty(handle, blk_bh);
845
            /* Last user gone: unlink the tree and free its root block. */
846         if (!rb->rf_count) {
847                 delete_tree = 1;
848                 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
849                 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
850                                                alloc_bh, bit, bg_blkno, 1);
851                 if (ret)
852                         mlog_errno(ret);
853         }
854
855 out_commit:
856         ocfs2_commit_trans(osb, handle);
857 out_unlock:
            /* alloc_inode is only set on the "last user" path. */
858         if (alloc_inode) {
859                 ocfs2_inode_unlock(alloc_inode, 1);
860                 brelse(alloc_bh);
861         }
862 out_mutex:
863         if (alloc_inode) {
864                 inode_unlock(alloc_inode);
865                 iput(alloc_inode);
866         }
867 out:
            /*
             * The unlock drops the reference taken by the lock call; the
             * extra put below drops the reference the tree was created
             * with, which lets an unlinked tree be destroyed.
             */
868         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
869         if (delete_tree)
870                 ocfs2_refcount_tree_put(ref_tree);
871         brelse(blk_bh);
872
873         return ret;
874 }
875
876 static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
877                                           struct buffer_head *ref_leaf_bh,
878                                           u64 cpos, unsigned int len,
879                                           struct ocfs2_refcount_rec *ret_rec,
880                                           int *index)
881 {
882         int i = 0;
883         struct ocfs2_refcount_block *rb =
884                 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
885         struct ocfs2_refcount_rec *rec = NULL;
886
887         for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
888                 rec = &rb->rf_records.rl_recs[i];
889
890                 if (le64_to_cpu(rec->r_cpos) +
891                     le32_to_cpu(rec->r_clusters) <= cpos)
892                         continue;
893                 else if (le64_to_cpu(rec->r_cpos) > cpos)
894                         break;
895
896                 /* ok, cpos fail in this rec. Just return. */
897                 if (ret_rec)
898                         *ret_rec = *rec;
899                 goto out;
900         }
901
902         if (ret_rec) {
903                 /* We meet with a hole here, so fake the rec. */
904                 ret_rec->r_cpos = cpu_to_le64(cpos);
905                 ret_rec->r_refcount = 0;
906                 if (i < le16_to_cpu(rb->rf_records.rl_used) &&
907                     le64_to_cpu(rec->r_cpos) < cpos + len)
908                         ret_rec->r_clusters =
909                                 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
910                 else
911                         ret_rec->r_clusters = cpu_to_le32(len);
912         }
913
914 out:
915         *index = i;
916 }
917
918 /*
919  * Try to remove refcount tree. The mechanism is:
920  * 1) Check whether i_clusters == 0, if no, exit.
921  * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
922  * 3) Check whether we have inline xattr stored outside, if yes, exit.
923  * 4) Remove the tree.
924  */
925 int ocfs2_try_remove_refcount_tree(struct inode *inode,
926                                    struct buffer_head *di_bh)
927 {
928         int ret;
929         struct ocfs2_inode_info *oi = OCFS2_I(inode);
930         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
931
932         down_write(&oi->ip_xattr_sem);
933         down_write(&oi->ip_alloc_sem);
934
935         if (oi->ip_clusters)
936                 goto out;
937
938         if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
939                 goto out;
940
941         if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
942             ocfs2_has_inline_xattr_value_outside(inode, di))
943                 goto out;
944
945         ret = ocfs2_remove_refcount_tree(inode, di_bh);
946         if (ret)
947                 mlog_errno(ret);
948 out:
949         up_write(&oi->ip_alloc_sem);
950         up_write(&oi->ip_xattr_sem);
951         return 0;
952 }
953
954 /*
955  * Find the end range for a leaf refcount block indicated by
956  * el->l_recs[index].e_blkno.
957  */
958 static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
959                                        struct buffer_head *ref_root_bh,
960                                        struct ocfs2_extent_block *eb,
961                                        struct ocfs2_extent_list *el,
962                                        int index,  u32 *cpos_end)
963 {
964         int ret, i, subtree_root;
965         u32 cpos;
966         u64 blkno;
967         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
968         struct ocfs2_path *left_path = NULL, *right_path = NULL;
969         struct ocfs2_extent_tree et;
970         struct ocfs2_extent_list *tmp_el;
971
972         if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
973                 /*
974                  * We have a extent rec after index, so just use the e_cpos
975                  * of the next extent rec.
976                  */
977                 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
978                 return 0;
979         }
980
981         if (!eb || (eb && !eb->h_next_leaf_blk)) {
982                 /*
983                  * We are the last extent rec, so any high cpos should
984                  * be stored in this leaf refcount block.
985                  */
986                 *cpos_end = UINT_MAX;
987                 return 0;
988         }
989
990         /*
991          * If the extent block isn't the last one, we have to find
992          * the subtree root between this extent block and the next
993          * leaf extent block and get the corresponding e_cpos from
994          * the subroot. Otherwise we may corrupt the b-tree.
995          */
996         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
997
998         left_path = ocfs2_new_path_from_et(&et);
999         if (!left_path) {
1000                 ret = -ENOMEM;
1001                 mlog_errno(ret);
1002                 goto out;
1003         }
1004
1005         cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1006         ret = ocfs2_find_path(ci, left_path, cpos);
1007         if (ret) {
1008                 mlog_errno(ret);
1009                 goto out;
1010         }
1011
1012         right_path = ocfs2_new_path_from_path(left_path);
1013         if (!right_path) {
1014                 ret = -ENOMEM;
1015                 mlog_errno(ret);
1016                 goto out;
1017         }
1018
1019         ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1020         if (ret) {
1021                 mlog_errno(ret);
1022                 goto out;
1023         }
1024
1025         ret = ocfs2_find_path(ci, right_path, cpos);
1026         if (ret) {
1027                 mlog_errno(ret);
1028                 goto out;
1029         }
1030
1031         subtree_root = ocfs2_find_subtree_root(&et, left_path,
1032                                                right_path);
1033
1034         tmp_el = left_path->p_node[subtree_root].el;
1035         blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1036         for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
1037                 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1038                         *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1039                         break;
1040                 }
1041         }
1042
1043         BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
1044
1045 out:
1046         ocfs2_free_path(left_path);
1047         ocfs2_free_path(right_path);
1048         return ret;
1049 }
1050
1051 /*
1052  * Given a cpos and len, try to find the refcount record which contains cpos.
1053  * 1. If cpos can be found in one refcount record, return the record.
1054  * 2. If cpos can't be found, return a fake record which start from cpos
1055  *    and end at a small value between cpos+len and start of the next record.
1056  *    This fake record has r_refcount = 0.
1057  */
1058 static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1059                                   struct buffer_head *ref_root_bh,
1060                                   u64 cpos, unsigned int len,
1061                                   struct ocfs2_refcount_rec *ret_rec,
1062                                   int *index,
1063                                   struct buffer_head **ret_bh)
1064 {
1065         int ret = 0, i, found;
1066         u32 low_cpos, cpos_end;
1067         struct ocfs2_extent_list *el;
1068         struct ocfs2_extent_rec *rec = NULL;
1069         struct ocfs2_extent_block *eb = NULL;
1070         struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1071         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1072         struct ocfs2_refcount_block *rb =
1073                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1074
1075         if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1076                 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1077                                               ret_rec, index);
1078                 *ret_bh = ref_root_bh;
1079                 get_bh(ref_root_bh);
1080                 return 0;
1081         }
1082
1083         el = &rb->rf_list;
1084         low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1085
1086         if (el->l_tree_depth) {
1087                 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1088                 if (ret) {
1089                         mlog_errno(ret);
1090                         goto out;
1091                 }
1092
1093                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1094                 el = &eb->h_list;
1095
1096                 if (el->l_tree_depth) {
1097                         ret = ocfs2_error(sb,
1098                                           "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
1099                                           (unsigned long long)ocfs2_metadata_cache_owner(ci),
1100                                           (unsigned long long)eb_bh->b_blocknr);
1101                         goto out;
1102                 }
1103         }
1104
1105         found = 0;
1106         for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1107                 rec = &el->l_recs[i];
1108
1109                 if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1110                         found = 1;
1111                         break;
1112                 }
1113         }
1114
1115         if (found) {
1116                 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1117                                                   eb, el, i, &cpos_end);
1118                 if (ret) {
1119                         mlog_errno(ret);
1120                         goto out;
1121                 }
1122
1123                 if (cpos_end < low_cpos + len)
1124                         len = cpos_end - low_cpos;
1125         }
1126
1127         ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1128                                         &ref_leaf_bh);
1129         if (ret) {
1130                 mlog_errno(ret);
1131                 goto out;
1132         }
1133
1134         ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1135                                       ret_rec, index);
1136         *ret_bh = ref_leaf_bh;
1137 out:
1138         brelse(eb_bh);
1139         return ret;
1140 }
1141
/* How a refcount record relates to its neighbours in the record list. */
enum ocfs2_ref_rec_contig {
	/* Not mergeable with either neighbour. */
	REF_CONTIG_NONE		= 0,
	/* Mergeable with the record before it. */
	REF_CONTIG_LEFT		= 1,
	/* Mergeable with the record after it. */
	REF_CONTIG_RIGHT	= 2,
	/* Mergeable with both neighbours. */
	REF_CONTIG_LEFTRIGHT	= 3,
};
1148
1149 static enum ocfs2_ref_rec_contig
1150         ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1151                                     int index)
1152 {
1153         if ((rb->rf_records.rl_recs[index].r_refcount ==
1154             rb->rf_records.rl_recs[index + 1].r_refcount) &&
1155             (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1156             le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1157             le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1158                 return REF_CONTIG_RIGHT;
1159
1160         return REF_CONTIG_NONE;
1161 }
1162
1163 static enum ocfs2_ref_rec_contig
1164         ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1165                                   int index)
1166 {
1167         enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1168
1169         if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1170                 ret = ocfs2_refcount_rec_adjacent(rb, index);
1171
1172         if (index > 0) {
1173                 enum ocfs2_ref_rec_contig tmp;
1174
1175                 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1176
1177                 if (tmp == REF_CONTIG_RIGHT) {
1178                         if (ret == REF_CONTIG_RIGHT)
1179                                 ret = REF_CONTIG_LEFTRIGHT;
1180                         else
1181                                 ret = REF_CONTIG_LEFT;
1182                 }
1183         }
1184
1185         return ret;
1186 }
1187
1188 static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1189                                            int index)
1190 {
1191         BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1192                rb->rf_records.rl_recs[index+1].r_refcount);
1193
1194         le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1195                      le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1196
1197         if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1198                 memmove(&rb->rf_records.rl_recs[index + 1],
1199                         &rb->rf_records.rl_recs[index + 2],
1200                         sizeof(struct ocfs2_refcount_rec) *
1201                         (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1202
1203         memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1204                0, sizeof(struct ocfs2_refcount_rec));
1205         le16_add_cpu(&rb->rf_records.rl_used, -1);
1206 }
1207
1208 /*
1209  * Merge the refcount rec if we are contiguous with the adjacent recs.
1210  */
1211 static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1212                                      int index)
1213 {
1214         enum ocfs2_ref_rec_contig contig =
1215                                 ocfs2_refcount_rec_contig(rb, index);
1216
1217         if (contig == REF_CONTIG_NONE)
1218                 return;
1219
1220         if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1221                 BUG_ON(index == 0);
1222                 index--;
1223         }
1224
1225         ocfs2_rotate_refcount_rec_left(rb, index);
1226
1227         if (contig == REF_CONTIG_LEFTRIGHT)
1228                 ocfs2_rotate_refcount_rec_left(rb, index);
1229 }
1230
1231 /*
1232  * Change the refcount indexed by "index" in ref_bh.
1233  * If refcount reaches 0, remove it.
1234  */
1235 static int ocfs2_change_refcount_rec(handle_t *handle,
1236                                      struct ocfs2_caching_info *ci,
1237                                      struct buffer_head *ref_leaf_bh,
1238                                      int index, int merge, int change)
1239 {
1240         int ret;
1241         struct ocfs2_refcount_block *rb =
1242                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1243         struct ocfs2_refcount_list *rl = &rb->rf_records;
1244         struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1245
1246         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1247                                       OCFS2_JOURNAL_ACCESS_WRITE);
1248         if (ret) {
1249                 mlog_errno(ret);
1250                 goto out;
1251         }
1252
1253         trace_ocfs2_change_refcount_rec(
1254                 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1255                 index, le32_to_cpu(rec->r_refcount), change);
1256         le32_add_cpu(&rec->r_refcount, change);
1257
1258         if (!rec->r_refcount) {
1259                 if (index != le16_to_cpu(rl->rl_used) - 1) {
1260                         memmove(rec, rec + 1,
1261                                 (le16_to_cpu(rl->rl_used) - index - 1) *
1262                                 sizeof(struct ocfs2_refcount_rec));
1263                         memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1264                                0, sizeof(struct ocfs2_refcount_rec));
1265                 }
1266
1267                 le16_add_cpu(&rl->rl_used, -1);
1268         } else if (merge)
1269                 ocfs2_refcount_rec_merge(rb, index);
1270
1271         ocfs2_journal_dirty(handle, ref_leaf_bh);
1272 out:
1273         return ret;
1274 }
1275
1276 static int ocfs2_expand_inline_ref_root(handle_t *handle,
1277                                         struct ocfs2_caching_info *ci,
1278                                         struct buffer_head *ref_root_bh,
1279                                         struct buffer_head **ref_leaf_bh,
1280                                         struct ocfs2_alloc_context *meta_ac)
1281 {
1282         int ret;
1283         u16 suballoc_bit_start;
1284         u32 num_got;
1285         u64 suballoc_loc, blkno;
1286         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1287         struct buffer_head *new_bh = NULL;
1288         struct ocfs2_refcount_block *new_rb;
1289         struct ocfs2_refcount_block *root_rb =
1290                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1291
1292         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1293                                       OCFS2_JOURNAL_ACCESS_WRITE);
1294         if (ret) {
1295                 mlog_errno(ret);
1296                 goto out;
1297         }
1298
1299         ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1300                                    &suballoc_bit_start, &num_got,
1301                                    &blkno);
1302         if (ret) {
1303                 mlog_errno(ret);
1304                 goto out;
1305         }
1306
1307         new_bh = sb_getblk(sb, blkno);
1308         if (new_bh == NULL) {
1309                 ret = -ENOMEM;
1310                 mlog_errno(ret);
1311                 goto out;
1312         }
1313         ocfs2_set_new_buffer_uptodate(ci, new_bh);
1314
1315         ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1316                                       OCFS2_JOURNAL_ACCESS_CREATE);
1317         if (ret) {
1318                 mlog_errno(ret);
1319                 goto out;
1320         }
1321
1322         /*
1323          * Initialize ocfs2_refcount_block.
1324          * It should contain the same information as the old root.
1325          * so just memcpy it and change the corresponding field.
1326          */
1327         memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1328
1329         new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1330         new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1331         new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1332         new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1333         new_rb->rf_blkno = cpu_to_le64(blkno);
1334         new_rb->rf_cpos = cpu_to_le32(0);
1335         new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1336         new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1337         ocfs2_journal_dirty(handle, new_bh);
1338
1339         /* Now change the root. */
1340         memset(&root_rb->rf_list, 0, sb->s_blocksize -
1341                offsetof(struct ocfs2_refcount_block, rf_list));
1342         root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1343         root_rb->rf_clusters = cpu_to_le32(1);
1344         root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1345         root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1346         root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1347         root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1348
1349         ocfs2_journal_dirty(handle, ref_root_bh);
1350
1351         trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1352                 le16_to_cpu(new_rb->rf_records.rl_used));
1353
1354         *ref_leaf_bh = new_bh;
1355         new_bh = NULL;
1356 out:
1357         brelse(new_bh);
1358         return ret;
1359 }
1360
1361 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1362                                            struct ocfs2_refcount_rec *next)
1363 {
1364         if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1365                 ocfs2_get_ref_rec_low_cpos(next))
1366                 return 1;
1367
1368         return 0;
1369 }
1370
1371 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1372 {
1373         const struct ocfs2_refcount_rec *l = a, *r = b;
1374         u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1375         u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1376
1377         if (l_cpos > r_cpos)
1378                 return 1;
1379         if (l_cpos < r_cpos)
1380                 return -1;
1381         return 0;
1382 }
1383
1384 static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1385 {
1386         const struct ocfs2_refcount_rec *l = a, *r = b;
1387         u64 l_cpos = le64_to_cpu(l->r_cpos);
1388         u64 r_cpos = le64_to_cpu(r->r_cpos);
1389
1390         if (l_cpos > r_cpos)
1391                 return 1;
1392         if (l_cpos < r_cpos)
1393                 return -1;
1394         return 0;
1395 }
1396
1397 static void swap_refcount_rec(void *a, void *b, int size)
1398 {
1399         struct ocfs2_refcount_rec *l = a, *r = b;
1400
1401         swap(*l, *r);
1402 }
1403
1404 /*
1405  * The refcount cpos are ordered by their 64bit cpos,
1406  * But we will use the low 32 bit to be the e_cpos in the b-tree.
1407  * So we need to make sure that this pos isn't intersected with others.
1408  *
1409  * Note: The refcount block is already sorted by their low 32 bit cpos,
1410  *       So just try the middle pos first, and we will exit when we find
1411  *       the good position.
1412  */
1413 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1414                                          u32 *split_pos, int *split_index)
1415 {
1416         int num_used = le16_to_cpu(rl->rl_used);
1417         int delta, middle = num_used / 2;
1418
1419         for (delta = 0; delta < middle; delta++) {
1420                 /* Let's check delta earlier than middle */
1421                 if (ocfs2_refcount_rec_no_intersect(
1422                                         &rl->rl_recs[middle - delta - 1],
1423                                         &rl->rl_recs[middle - delta])) {
1424                         *split_index = middle - delta;
1425                         break;
1426                 }
1427
1428                 /* For even counts, don't walk off the end */
1429                 if ((middle + delta + 1) == num_used)
1430                         continue;
1431
1432                 /* Now try delta past middle */
1433                 if (ocfs2_refcount_rec_no_intersect(
1434                                         &rl->rl_recs[middle + delta],
1435                                         &rl->rl_recs[middle + delta + 1])) {
1436                         *split_index = middle + delta + 1;
1437                         break;
1438                 }
1439         }
1440
1441         if (delta >= middle)
1442                 return -ENOSPC;
1443
1444         *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1445         return 0;
1446 }
1447
1448 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1449                                             struct buffer_head *new_bh,
1450                                             u32 *split_cpos)
1451 {
1452         int split_index = 0, num_moved, ret;
1453         u32 cpos = 0;
1454         struct ocfs2_refcount_block *rb =
1455                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1456         struct ocfs2_refcount_list *rl = &rb->rf_records;
1457         struct ocfs2_refcount_block *new_rb =
1458                         (struct ocfs2_refcount_block *)new_bh->b_data;
1459         struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1460
1461         trace_ocfs2_divide_leaf_refcount_block(
1462                 (unsigned long long)ref_leaf_bh->b_blocknr,
1463                 le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
1464
1465         /*
1466          * XXX: Improvement later.
1467          * If we know all the high 32 bit cpos is the same, no need to sort.
1468          *
1469          * In order to make the whole process safe, we do:
1470          * 1. sort the entries by their low 32 bit cpos first so that we can
1471          *    find the split cpos easily.
1472          * 2. call ocfs2_insert_extent to insert the new refcount block.
1473          * 3. move the refcount rec to the new block.
1474          * 4. sort the entries by their 64 bit cpos.
1475          * 5. dirty the new_rb and rb.
1476          */
1477         sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1478              sizeof(struct ocfs2_refcount_rec),
1479              cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1480
1481         ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1482         if (ret) {
1483                 mlog_errno(ret);
1484                 return ret;
1485         }
1486
1487         new_rb->rf_cpos = cpu_to_le32(cpos);
1488
1489         /* move refcount records starting from split_index to the new block. */
1490         num_moved = le16_to_cpu(rl->rl_used) - split_index;
1491         memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1492                num_moved * sizeof(struct ocfs2_refcount_rec));
1493
1494         /*ok, remove the entries we just moved over to the other block. */
1495         memset(&rl->rl_recs[split_index], 0,
1496                num_moved * sizeof(struct ocfs2_refcount_rec));
1497
1498         /* change old and new rl_used accordingly. */
1499         le16_add_cpu(&rl->rl_used, -num_moved);
1500         new_rl->rl_used = cpu_to_le16(num_moved);
1501
1502         sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1503              sizeof(struct ocfs2_refcount_rec),
1504              cmp_refcount_rec_by_cpos, swap_refcount_rec);
1505
1506         sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1507              sizeof(struct ocfs2_refcount_rec),
1508              cmp_refcount_rec_by_cpos, swap_refcount_rec);
1509
1510         *split_cpos = cpos;
1511         return 0;
1512 }
1513
1514 static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1515                                          struct ocfs2_caching_info *ci,
1516                                          struct buffer_head *ref_root_bh,
1517                                          struct buffer_head *ref_leaf_bh,
1518                                          struct ocfs2_alloc_context *meta_ac)
1519 {
1520         int ret;
1521         u16 suballoc_bit_start;
1522         u32 num_got, new_cpos;
1523         u64 suballoc_loc, blkno;
1524         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1525         struct ocfs2_refcount_block *root_rb =
1526                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1527         struct buffer_head *new_bh = NULL;
1528         struct ocfs2_refcount_block *new_rb;
1529         struct ocfs2_extent_tree ref_et;
1530
1531         BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1532
1533         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1534                                       OCFS2_JOURNAL_ACCESS_WRITE);
1535         if (ret) {
1536                 mlog_errno(ret);
1537                 goto out;
1538         }
1539
1540         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1541                                       OCFS2_JOURNAL_ACCESS_WRITE);
1542         if (ret) {
1543                 mlog_errno(ret);
1544                 goto out;
1545         }
1546
1547         ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1548                                    &suballoc_bit_start, &num_got,
1549                                    &blkno);
1550         if (ret) {
1551                 mlog_errno(ret);
1552                 goto out;
1553         }
1554
1555         new_bh = sb_getblk(sb, blkno);
1556         if (new_bh == NULL) {
1557                 ret = -ENOMEM;
1558                 mlog_errno(ret);
1559                 goto out;
1560         }
1561         ocfs2_set_new_buffer_uptodate(ci, new_bh);
1562
1563         ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1564                                       OCFS2_JOURNAL_ACCESS_CREATE);
1565         if (ret) {
1566                 mlog_errno(ret);
1567                 goto out;
1568         }
1569
1570         /* Initialize ocfs2_refcount_block. */
1571         new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1572         memset(new_rb, 0, sb->s_blocksize);
1573         strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1574         new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1575         new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1576         new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1577         new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1578         new_rb->rf_blkno = cpu_to_le64(blkno);
1579         new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1580         new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1581         new_rb->rf_records.rl_count =
1582                                 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1583         new_rb->rf_generation = root_rb->rf_generation;
1584
1585         ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1586         if (ret) {
1587                 mlog_errno(ret);
1588                 goto out;
1589         }
1590
1591         ocfs2_journal_dirty(handle, ref_leaf_bh);
1592         ocfs2_journal_dirty(handle, new_bh);
1593
1594         ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1595
1596         trace_ocfs2_new_leaf_refcount_block(
1597                         (unsigned long long)new_bh->b_blocknr, new_cpos);
1598
1599         /* Insert the new leaf block with the specific offset cpos. */
1600         ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1601                                   1, 0, meta_ac);
1602         if (ret)
1603                 mlog_errno(ret);
1604
1605 out:
1606         brelse(new_bh);
1607         return ret;
1608 }
1609
1610 static int ocfs2_expand_refcount_tree(handle_t *handle,
1611                                       struct ocfs2_caching_info *ci,
1612                                       struct buffer_head *ref_root_bh,
1613                                       struct buffer_head *ref_leaf_bh,
1614                                       struct ocfs2_alloc_context *meta_ac)
1615 {
1616         int ret;
1617         struct buffer_head *expand_bh = NULL;
1618
1619         if (ref_root_bh == ref_leaf_bh) {
1620                 /*
1621                  * the old root bh hasn't been expanded to a b-tree,
1622                  * so expand it first.
1623                  */
1624                 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1625                                                    &expand_bh, meta_ac);
1626                 if (ret) {
1627                         mlog_errno(ret);
1628                         goto out;
1629                 }
1630         } else {
1631                 expand_bh = ref_leaf_bh;
1632                 get_bh(expand_bh);
1633         }
1634
1635
1636         /* Now add a new refcount block into the tree.*/
1637         ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1638                                             expand_bh, meta_ac);
1639         if (ret)
1640                 mlog_errno(ret);
1641 out:
1642         brelse(expand_bh);
1643         return ret;
1644 }
1645
1646 /*
1647  * Adjust the extent rec in b-tree representing ref_leaf_bh.
1648  *
1649  * Only called when we have inserted a new refcount rec at index 0
1650  * which means ocfs2_extent_rec.e_cpos may need some change.
1651  */
1652 static int ocfs2_adjust_refcount_rec(handle_t *handle,
1653                                      struct ocfs2_caching_info *ci,
1654                                      struct buffer_head *ref_root_bh,
1655                                      struct buffer_head *ref_leaf_bh,
1656                                      struct ocfs2_refcount_rec *rec)
1657 {
1658         int ret = 0, i;
1659         u32 new_cpos, old_cpos;
1660         struct ocfs2_path *path = NULL;
1661         struct ocfs2_extent_tree et;
1662         struct ocfs2_refcount_block *rb =
1663                 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1664         struct ocfs2_extent_list *el;
1665
1666         if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1667                 goto out;
1668
1669         rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1670         old_cpos = le32_to_cpu(rb->rf_cpos);
1671         new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1672         if (old_cpos <= new_cpos)
1673                 goto out;
1674
1675         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1676
1677         path = ocfs2_new_path_from_et(&et);
1678         if (!path) {
1679                 ret = -ENOMEM;
1680                 mlog_errno(ret);
1681                 goto out;
1682         }
1683
1684         ret = ocfs2_find_path(ci, path, old_cpos);
1685         if (ret) {
1686                 mlog_errno(ret);
1687                 goto out;
1688         }
1689
1690         /*
1691          * 2 more credits, one for the leaf refcount block, one for
1692          * the extent block contains the extent rec.
1693          */
1694         ret = ocfs2_extend_trans(handle, 2);
1695         if (ret < 0) {
1696                 mlog_errno(ret);
1697                 goto out;
1698         }
1699
1700         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1701                                       OCFS2_JOURNAL_ACCESS_WRITE);
1702         if (ret < 0) {
1703                 mlog_errno(ret);
1704                 goto out;
1705         }
1706
1707         ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1708                                       OCFS2_JOURNAL_ACCESS_WRITE);
1709         if (ret < 0) {
1710                 mlog_errno(ret);
1711                 goto out;
1712         }
1713
1714         /* change the leaf extent block first. */
1715         el = path_leaf_el(path);
1716
1717         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1718                 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1719                         break;
1720
1721         BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1722
1723         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1724
1725         /* change the r_cpos in the leaf block. */
1726         rb->rf_cpos = cpu_to_le32(new_cpos);
1727
1728         ocfs2_journal_dirty(handle, path_leaf_bh(path));
1729         ocfs2_journal_dirty(handle, ref_leaf_bh);
1730
1731 out:
1732         ocfs2_free_path(path);
1733         return ret;
1734 }
1735
1736 static int ocfs2_insert_refcount_rec(handle_t *handle,
1737                                      struct ocfs2_caching_info *ci,
1738                                      struct buffer_head *ref_root_bh,
1739                                      struct buffer_head *ref_leaf_bh,
1740                                      struct ocfs2_refcount_rec *rec,
1741                                      int index, int merge,
1742                                      struct ocfs2_alloc_context *meta_ac)
1743 {
1744         int ret;
1745         struct ocfs2_refcount_block *rb =
1746                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1747         struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1748         struct buffer_head *new_bh = NULL;
1749
1750         BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1751
1752         if (rf_list->rl_used == rf_list->rl_count) {
1753                 u64 cpos = le64_to_cpu(rec->r_cpos);
1754                 u32 len = le32_to_cpu(rec->r_clusters);
1755
1756                 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1757                                                  ref_leaf_bh, meta_ac);
1758                 if (ret) {
1759                         mlog_errno(ret);
1760                         goto out;
1761                 }
1762
1763                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1764                                              cpos, len, NULL, &index,
1765                                              &new_bh);
1766                 if (ret) {
1767                         mlog_errno(ret);
1768                         goto out;
1769                 }
1770
1771                 ref_leaf_bh = new_bh;
1772                 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1773                 rf_list = &rb->rf_records;
1774         }
1775
1776         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1777                                       OCFS2_JOURNAL_ACCESS_WRITE);
1778         if (ret) {
1779                 mlog_errno(ret);
1780                 goto out;
1781         }
1782
1783         if (index < le16_to_cpu(rf_list->rl_used))
1784                 memmove(&rf_list->rl_recs[index + 1],
1785                         &rf_list->rl_recs[index],
1786                         (le16_to_cpu(rf_list->rl_used) - index) *
1787                          sizeof(struct ocfs2_refcount_rec));
1788
1789         trace_ocfs2_insert_refcount_rec(
1790                 (unsigned long long)ref_leaf_bh->b_blocknr, index,
1791                 (unsigned long long)le64_to_cpu(rec->r_cpos),
1792                 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1793
1794         rf_list->rl_recs[index] = *rec;
1795
1796         le16_add_cpu(&rf_list->rl_used, 1);
1797
1798         if (merge)
1799                 ocfs2_refcount_rec_merge(rb, index);
1800
1801         ocfs2_journal_dirty(handle, ref_leaf_bh);
1802
1803         if (index == 0) {
1804                 ret = ocfs2_adjust_refcount_rec(handle, ci,
1805                                                 ref_root_bh,
1806                                                 ref_leaf_bh, rec);
1807                 if (ret)
1808                         mlog_errno(ret);
1809         }
1810 out:
1811         brelse(new_bh);
1812         return ret;
1813 }
1814
1815 /*
1816  * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1817  * This is much simple than our b-tree code.
1818  * split_rec is the new refcount rec we want to insert.
1819  * If split_rec->r_refcount > 0, we are changing the refcount(in case we
1820  * increase refcount or decrease a refcount to non-zero).
1821  * If split_rec->r_refcount == 0, we are punching a hole in current refcount
1822  * rec( in case we decrease a refcount to zero).
1823  */
1824 static int ocfs2_split_refcount_rec(handle_t *handle,
1825                                     struct ocfs2_caching_info *ci,
1826                                     struct buffer_head *ref_root_bh,
1827                                     struct buffer_head *ref_leaf_bh,
1828                                     struct ocfs2_refcount_rec *split_rec,
1829                                     int index, int merge,
1830                                     struct ocfs2_alloc_context *meta_ac,
1831                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
1832 {
1833         int ret, recs_need;
1834         u32 len;
1835         struct ocfs2_refcount_block *rb =
1836                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1837         struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1838         struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1839         struct ocfs2_refcount_rec *tail_rec = NULL;
1840         struct buffer_head *new_bh = NULL;
1841
1842         BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1843
1844         trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
1845                 le32_to_cpu(orig_rec->r_clusters),
1846                 le32_to_cpu(orig_rec->r_refcount),
1847                 le64_to_cpu(split_rec->r_cpos),
1848                 le32_to_cpu(split_rec->r_clusters),
1849                 le32_to_cpu(split_rec->r_refcount));
1850
1851         /*
1852          * If we just need to split the header or tail clusters,
1853          * no more recs are needed, just split is OK.
1854          * Otherwise we at least need one new recs.
1855          */
1856         if (!split_rec->r_refcount &&
1857             (split_rec->r_cpos == orig_rec->r_cpos ||
1858              le64_to_cpu(split_rec->r_cpos) +
1859              le32_to_cpu(split_rec->r_clusters) ==
1860              le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1861                 recs_need = 0;
1862         else
1863                 recs_need = 1;
1864
1865         /*
1866          * We need one more rec if we split in the middle and the new rec have
1867          * some refcount in it.
1868          */
1869         if (split_rec->r_refcount &&
1870             (split_rec->r_cpos != orig_rec->r_cpos &&
1871              le64_to_cpu(split_rec->r_cpos) +
1872              le32_to_cpu(split_rec->r_clusters) !=
1873              le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1874                 recs_need++;
1875
1876         /* If the leaf block don't have enough record, expand it. */
1877         if (le16_to_cpu(rf_list->rl_used) + recs_need >
1878                                          le16_to_cpu(rf_list->rl_count)) {
1879                 struct ocfs2_refcount_rec tmp_rec;
1880                 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1881                 len = le32_to_cpu(orig_rec->r_clusters);
1882                 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1883                                                  ref_leaf_bh, meta_ac);
1884                 if (ret) {
1885                         mlog_errno(ret);
1886                         goto out;
1887                 }
1888
1889                 /*
1890                  * We have to re-get it since now cpos may be moved to
1891                  * another leaf block.
1892                  */
1893                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1894                                              cpos, len, &tmp_rec, &index,
1895                                              &new_bh);
1896                 if (ret) {
1897                         mlog_errno(ret);
1898                         goto out;
1899                 }
1900
1901                 ref_leaf_bh = new_bh;
1902                 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1903                 rf_list = &rb->rf_records;
1904                 orig_rec = &rf_list->rl_recs[index];
1905         }
1906
1907         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1908                                       OCFS2_JOURNAL_ACCESS_WRITE);
1909         if (ret) {
1910                 mlog_errno(ret);
1911                 goto out;
1912         }
1913
1914         /*
1915          * We have calculated out how many new records we need and store
1916          * in recs_need, so spare enough space first by moving the records
1917          * after "index" to the end.
1918          */
1919         if (index != le16_to_cpu(rf_list->rl_used) - 1)
1920                 memmove(&rf_list->rl_recs[index + 1 + recs_need],
1921                         &rf_list->rl_recs[index + 1],
1922                         (le16_to_cpu(rf_list->rl_used) - index - 1) *
1923                          sizeof(struct ocfs2_refcount_rec));
1924
1925         len = (le64_to_cpu(orig_rec->r_cpos) +
1926               le32_to_cpu(orig_rec->r_clusters)) -
1927               (le64_to_cpu(split_rec->r_cpos) +
1928               le32_to_cpu(split_rec->r_clusters));
1929
1930         /*
1931          * If we have "len", the we will split in the tail and move it
1932          * to the end of the space we have just spared.
1933          */
1934         if (len) {
1935                 tail_rec = &rf_list->rl_recs[index + recs_need];
1936
1937                 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1938                 le64_add_cpu(&tail_rec->r_cpos,
1939                              le32_to_cpu(tail_rec->r_clusters) - len);
1940                 tail_rec->r_clusters = cpu_to_le32(len);
1941         }
1942
1943         /*
1944          * If the split pos isn't the same as the original one, we need to
1945          * split in the head.
1946          *
1947          * Note: We have the chance that split_rec.r_refcount = 0,
1948          * recs_need = 0 and len > 0, which means we just cut the head from
1949          * the orig_rec and in that case we have done some modification in
1950          * orig_rec above, so the check for r_cpos is faked.
1951          */
1952         if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1953                 len = le64_to_cpu(split_rec->r_cpos) -
1954                       le64_to_cpu(orig_rec->r_cpos);
1955                 orig_rec->r_clusters = cpu_to_le32(len);
1956                 index++;
1957         }
1958
1959         le16_add_cpu(&rf_list->rl_used, recs_need);
1960
1961         if (split_rec->r_refcount) {
1962                 rf_list->rl_recs[index] = *split_rec;
1963                 trace_ocfs2_split_refcount_rec_insert(
1964                         (unsigned long long)ref_leaf_bh->b_blocknr, index,
1965                         (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1966                         le32_to_cpu(split_rec->r_clusters),
1967                         le32_to_cpu(split_rec->r_refcount));
1968
1969                 if (merge)
1970                         ocfs2_refcount_rec_merge(rb, index);
1971         }
1972
1973         ocfs2_journal_dirty(handle, ref_leaf_bh);
1974
1975 out:
1976         brelse(new_bh);
1977         return ret;
1978 }
1979
1980 static int __ocfs2_increase_refcount(handle_t *handle,
1981                                      struct ocfs2_caching_info *ci,
1982                                      struct buffer_head *ref_root_bh,
1983                                      u64 cpos, u32 len, int merge,
1984                                      struct ocfs2_alloc_context *meta_ac,
1985                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
1986 {
1987         int ret = 0, index;
1988         struct buffer_head *ref_leaf_bh = NULL;
1989         struct ocfs2_refcount_rec rec;
1990         unsigned int set_len = 0;
1991
1992         trace_ocfs2_increase_refcount_begin(
1993              (unsigned long long)ocfs2_metadata_cache_owner(ci),
1994              (unsigned long long)cpos, len);
1995
1996         while (len) {
1997                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1998                                              cpos, len, &rec, &index,
1999                                              &ref_leaf_bh);
2000                 if (ret) {
2001                         mlog_errno(ret);
2002                         goto out;
2003                 }
2004
2005                 set_len = le32_to_cpu(rec.r_clusters);
2006
2007                 /*
2008                  * Here we may meet with 3 situations:
2009                  *
2010                  * 1. If we find an already existing record, and the length
2011                  *    is the same, cool, we just need to increase the r_refcount
2012                  *    and it is OK.
2013                  * 2. If we find a hole, just insert it with r_refcount = 1.
2014                  * 3. If we are in the middle of one extent record, split
2015                  *    it.
2016                  */
2017                 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2018                     set_len <= len) {
2019                         trace_ocfs2_increase_refcount_change(
2020                                 (unsigned long long)cpos, set_len,
2021                                 le32_to_cpu(rec.r_refcount));
2022                         ret = ocfs2_change_refcount_rec(handle, ci,
2023                                                         ref_leaf_bh, index,
2024                                                         merge, 1);
2025                         if (ret) {
2026                                 mlog_errno(ret);
2027                                 goto out;
2028                         }
2029                 } else if (!rec.r_refcount) {
2030                         rec.r_refcount = cpu_to_le32(1);
2031
2032                         trace_ocfs2_increase_refcount_insert(
2033                              (unsigned long long)le64_to_cpu(rec.r_cpos),
2034                              set_len);
2035                         ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2036                                                         ref_leaf_bh,
2037                                                         &rec, index,
2038                                                         merge, meta_ac);
2039                         if (ret) {
2040                                 mlog_errno(ret);
2041                                 goto out;
2042                         }
2043                 } else  {
2044                         set_len = min((u64)(cpos + len),
2045                                       le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2046                         rec.r_cpos = cpu_to_le64(cpos);
2047                         rec.r_clusters = cpu_to_le32(set_len);
2048                         le32_add_cpu(&rec.r_refcount, 1);
2049
2050                         trace_ocfs2_increase_refcount_split(
2051                              (unsigned long long)le64_to_cpu(rec.r_cpos),
2052                              set_len, le32_to_cpu(rec.r_refcount));
2053                         ret = ocfs2_split_refcount_rec(handle, ci,
2054                                                        ref_root_bh, ref_leaf_bh,
2055                                                        &rec, index, merge,
2056                                                        meta_ac, dealloc);
2057                         if (ret) {
2058                                 mlog_errno(ret);
2059                                 goto out;
2060                         }
2061                 }
2062
2063                 cpos += set_len;
2064                 len -= set_len;
2065                 brelse(ref_leaf_bh);
2066                 ref_leaf_bh = NULL;
2067         }
2068
2069 out:
2070         brelse(ref_leaf_bh);
2071         return ret;
2072 }
2073
2074 static int ocfs2_remove_refcount_extent(handle_t *handle,
2075                                 struct ocfs2_caching_info *ci,
2076                                 struct buffer_head *ref_root_bh,
2077                                 struct buffer_head *ref_leaf_bh,
2078                                 struct ocfs2_alloc_context *meta_ac,
2079                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2080 {
2081         int ret;
2082         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2083         struct ocfs2_refcount_block *rb =
2084                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2085         struct ocfs2_extent_tree et;
2086
2087         BUG_ON(rb->rf_records.rl_used);
2088
2089         trace_ocfs2_remove_refcount_extent(
2090                 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2091                 (unsigned long long)ref_leaf_bh->b_blocknr,
2092                 le32_to_cpu(rb->rf_cpos));
2093
2094         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2095         ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2096                                   1, meta_ac, dealloc);
2097         if (ret) {
2098                 mlog_errno(ret);
2099                 goto out;
2100         }
2101
2102         ocfs2_remove_from_cache(ci, ref_leaf_bh);
2103
2104         /*
2105          * add the freed block to the dealloc so that it will be freed
2106          * when we run dealloc.
2107          */
2108         ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2109                                         le16_to_cpu(rb->rf_suballoc_slot),
2110                                         le64_to_cpu(rb->rf_suballoc_loc),
2111                                         le64_to_cpu(rb->rf_blkno),
2112                                         le16_to_cpu(rb->rf_suballoc_bit));
2113         if (ret) {
2114                 mlog_errno(ret);
2115                 goto out;
2116         }
2117
2118         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2119                                       OCFS2_JOURNAL_ACCESS_WRITE);
2120         if (ret) {
2121                 mlog_errno(ret);
2122                 goto out;
2123         }
2124
2125         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2126
2127         le32_add_cpu(&rb->rf_clusters, -1);
2128
2129         /*
2130          * check whether we need to restore the root refcount block if
2131          * there is no leaf extent block at atll.
2132          */
2133         if (!rb->rf_list.l_next_free_rec) {
2134                 BUG_ON(rb->rf_clusters);
2135
2136                 trace_ocfs2_restore_refcount_block(
2137                      (unsigned long long)ref_root_bh->b_blocknr);
2138
2139                 rb->rf_flags = 0;
2140                 rb->rf_parent = 0;
2141                 rb->rf_cpos = 0;
2142                 memset(&rb->rf_records, 0, sb->s_blocksize -
2143                        offsetof(struct ocfs2_refcount_block, rf_records));
2144                 rb->rf_records.rl_count =
2145                                 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2146         }
2147
2148         ocfs2_journal_dirty(handle, ref_root_bh);
2149
2150 out:
2151         return ret;
2152 }
2153
2154 int ocfs2_increase_refcount(handle_t *handle,
2155                             struct ocfs2_caching_info *ci,
2156                             struct buffer_head *ref_root_bh,
2157                             u64 cpos, u32 len,
2158                             struct ocfs2_alloc_context *meta_ac,
2159                             struct ocfs2_cached_dealloc_ctxt *dealloc)
2160 {
2161         return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2162                                          cpos, len, 1,
2163                                          meta_ac, dealloc);
2164 }
2165
2166 static int ocfs2_decrease_refcount_rec(handle_t *handle,
2167                                 struct ocfs2_caching_info *ci,
2168                                 struct buffer_head *ref_root_bh,
2169                                 struct buffer_head *ref_leaf_bh,
2170                                 int index, u64 cpos, unsigned int len,
2171                                 struct ocfs2_alloc_context *meta_ac,
2172                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2173 {
2174         int ret;
2175         struct ocfs2_refcount_block *rb =
2176                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2177         struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2178
2179         BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2180         BUG_ON(cpos + len >
2181                le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2182
2183         trace_ocfs2_decrease_refcount_rec(
2184                 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2185                 (unsigned long long)cpos, len);
2186
2187         if (cpos == le64_to_cpu(rec->r_cpos) &&
2188             len == le32_to_cpu(rec->r_clusters))
2189                 ret = ocfs2_change_refcount_rec(handle, ci,
2190                                                 ref_leaf_bh, index, 1, -1);
2191         else {
2192                 struct ocfs2_refcount_rec split = *rec;
2193                 split.r_cpos = cpu_to_le64(cpos);
2194                 split.r_clusters = cpu_to_le32(len);
2195
2196                 le32_add_cpu(&split.r_refcount, -1);
2197
2198                 ret = ocfs2_split_refcount_rec(handle, ci,
2199                                                ref_root_bh, ref_leaf_bh,
2200                                                &split, index, 1,
2201                                                meta_ac, dealloc);
2202         }
2203
2204         if (ret) {
2205                 mlog_errno(ret);
2206                 goto out;
2207         }
2208
2209         /* Remove the leaf refcount block if it contains no refcount record. */
2210         if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2211                 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2212                                                    ref_leaf_bh, meta_ac,
2213                                                    dealloc);
2214                 if (ret)
2215                         mlog_errno(ret);
2216         }
2217
2218 out:
2219         return ret;
2220 }
2221
2222 static int __ocfs2_decrease_refcount(handle_t *handle,
2223                                      struct ocfs2_caching_info *ci,
2224                                      struct buffer_head *ref_root_bh,
2225                                      u64 cpos, u32 len,
2226                                      struct ocfs2_alloc_context *meta_ac,
2227                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2228                                      int delete)
2229 {
2230         int ret = 0, index = 0;
2231         struct ocfs2_refcount_rec rec;
2232         unsigned int r_count = 0, r_len;
2233         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2234         struct buffer_head *ref_leaf_bh = NULL;
2235
2236         trace_ocfs2_decrease_refcount(
2237                 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2238                 (unsigned long long)cpos, len, delete);
2239
2240         while (len) {
2241                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2242                                              cpos, len, &rec, &index,
2243                                              &ref_leaf_bh);
2244                 if (ret) {
2245                         mlog_errno(ret);
2246                         goto out;
2247                 }
2248
2249                 r_count = le32_to_cpu(rec.r_refcount);
2250                 BUG_ON(r_count == 0);
2251                 if (!delete)
2252                         BUG_ON(r_count > 1);
2253
2254                 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2255                               le32_to_cpu(rec.r_clusters)) - cpos;
2256
2257                 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2258                                                   ref_leaf_bh, index,
2259                                                   cpos, r_len,
2260                                                   meta_ac, dealloc);
2261                 if (ret) {
2262                         mlog_errno(ret);
2263                         goto out;
2264                 }
2265
2266                 if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2267                         ret = ocfs2_cache_cluster_dealloc(dealloc,
2268                                           ocfs2_clusters_to_blocks(sb, cpos),
2269                                                           r_len);
2270                         if (ret) {
2271                                 mlog_errno(ret);
2272                                 goto out;
2273                         }
2274                 }
2275
2276                 cpos += r_len;
2277                 len -= r_len;
2278                 brelse(ref_leaf_bh);
2279                 ref_leaf_bh = NULL;
2280         }
2281
2282 out:
2283         brelse(ref_leaf_bh);
2284         return ret;
2285 }
2286
2287 /* Caller must hold refcount tree lock. */
2288 int ocfs2_decrease_refcount(struct inode *inode,
2289                             handle_t *handle, u32 cpos, u32 len,
2290                             struct ocfs2_alloc_context *meta_ac,
2291                             struct ocfs2_cached_dealloc_ctxt *dealloc,
2292                             int delete)
2293 {
2294         int ret;
2295         u64 ref_blkno;
2296         struct buffer_head *ref_root_bh = NULL;
2297         struct ocfs2_refcount_tree *tree;
2298
2299         BUG_ON(!ocfs2_is_refcount_inode(inode));
2300
2301         ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2302         if (ret) {
2303                 mlog_errno(ret);
2304                 goto out;
2305         }
2306
2307         ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2308         if (ret) {
2309                 mlog_errno(ret);
2310                 goto out;
2311         }
2312
2313         ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2314                                         &ref_root_bh);
2315         if (ret) {
2316                 mlog_errno(ret);
2317                 goto out;
2318         }
2319
2320         ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2321                                         cpos, len, meta_ac, dealloc, delete);
2322         if (ret)
2323                 mlog_errno(ret);
2324 out:
2325         brelse(ref_root_bh);
2326         return ret;
2327 }
2328
2329 /*
2330  * Mark the already-existing extent at cpos as refcounted for len clusters.
2331  * This adds the refcount extent flag.
2332  *
2333  * If the existing extent is larger than the request, initiate a
2334  * split. An attempt will be made at merging with adjacent extents.
2335  *
2336  * The caller is responsible for passing down meta_ac if we'll need it.
2337  */
2338 static int ocfs2_mark_extent_refcounted(struct inode *inode,
2339                                 struct ocfs2_extent_tree *et,
2340                                 handle_t *handle, u32 cpos,
2341                                 u32 len, u32 phys,
2342                                 struct ocfs2_alloc_context *meta_ac,
2343                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2344 {
2345         int ret;
2346
2347         trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2348                                            cpos, len, phys);
2349
2350         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2351                 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2352                                   inode->i_ino);
2353                 goto out;
2354         }
2355
2356         ret = ocfs2_change_extent_flag(handle, et, cpos,
2357                                        len, phys, meta_ac, dealloc,
2358                                        OCFS2_EXT_REFCOUNTED, 0);
2359         if (ret)
2360                 mlog_errno(ret);
2361
2362 out:
2363         return ret;
2364 }
2365
2366 /*
2367  * Given some contiguous physical clusters, calculate what we need
2368  * for modifying their refcount.
2369  */
2370 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2371                                             struct ocfs2_caching_info *ci,
2372                                             struct buffer_head *ref_root_bh,
2373                                             u64 start_cpos,
2374                                             u32 clusters,
2375                                             int *meta_add,
2376                                             int *credits)
2377 {
2378         int ret = 0, index, ref_blocks = 0, recs_add = 0;
2379         u64 cpos = start_cpos;
2380         struct ocfs2_refcount_block *rb;
2381         struct ocfs2_refcount_rec rec;
2382         struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2383         u32 len;
2384
2385         while (clusters) {
2386                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2387                                              cpos, clusters, &rec,
2388                                              &index, &ref_leaf_bh);
2389                 if (ret) {
2390                         mlog_errno(ret);
2391                         goto out;
2392                 }
2393
2394                 if (ref_leaf_bh != prev_bh) {
2395                         /*
2396                          * Now we encounter a new leaf block, so calculate
2397                          * whether we need to extend the old leaf.
2398                          */
2399                         if (prev_bh) {
2400                                 rb = (struct ocfs2_refcount_block *)
2401                                                         prev_bh->b_data;
2402
2403                                 if (le16_to_cpu(rb->rf_records.rl_used) +
2404                                     recs_add >
2405                                     le16_to_cpu(rb->rf_records.rl_count))
2406                                         ref_blocks++;
2407                         }
2408
2409                         recs_add = 0;
2410                         *credits += 1;
2411                         brelse(prev_bh);
2412                         prev_bh = ref_leaf_bh;
2413                         get_bh(prev_bh);
2414                 }
2415
2416                 trace_ocfs2_calc_refcount_meta_credits_iterate(
2417                                 recs_add, (unsigned long long)cpos, clusters,
2418                                 (unsigned long long)le64_to_cpu(rec.r_cpos),
2419                                 le32_to_cpu(rec.r_clusters),
2420                                 le32_to_cpu(rec.r_refcount), index);
2421
2422                 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2423                           le32_to_cpu(rec.r_clusters)) - cpos;
2424                 /*
2425                  * We record all the records which will be inserted to the
2426                  * same refcount block, so that we can tell exactly whether
2427                  * we need a new refcount block or not.
2428                  *
2429                  * If we will insert a new one, this is easy and only happens
2430                  * during adding refcounted flag to the extent, so we don't
2431                  * have a chance of spliting. We just need one record.
2432                  *
2433                  * If the refcount rec already exists, that would be a little
2434                  * complicated. we may have to:
2435                  * 1) split at the beginning if the start pos isn't aligned.
2436                  *    we need 1 more record in this case.
2437                  * 2) split int the end if the end pos isn't aligned.
2438                  *    we need 1 more record in this case.
2439                  * 3) split in the middle because of file system fragmentation.
2440                  *    we need 2 more records in this case(we can't detect this
2441                  *    beforehand, so always think of the worst case).
2442                  */
2443                 if (rec.r_refcount) {
2444                         recs_add += 2;
2445                         /* Check whether we need a split at the beginning. */
2446                         if (cpos == start_cpos &&
2447                             cpos != le64_to_cpu(rec.r_cpos))
2448                                 recs_add++;
2449
2450                         /* Check whether we need a split in the end. */
2451                         if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2452                             le32_to_cpu(rec.r_clusters))
2453                                 recs_add++;
2454                 } else
2455                         recs_add++;
2456
2457                 brelse(ref_leaf_bh);
2458                 ref_leaf_bh = NULL;
2459                 clusters -= len;
2460                 cpos += len;
2461         }
2462
2463         if (prev_bh) {
2464                 rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2465
2466                 if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
2467                     le16_to_cpu(rb->rf_records.rl_count))
2468                         ref_blocks++;
2469
2470                 *credits += 1;
2471         }
2472
2473         if (!ref_blocks)
2474                 goto out;
2475
2476         *meta_add += ref_blocks;
2477         *credits += ref_blocks;
2478
2479         /*
2480          * So we may need ref_blocks to insert into the tree.
2481          * That also means we need to change the b-tree and add that number
2482          * of records since we never merge them.
2483          * We need one more block for expansion since the new created leaf
2484          * block is also full and needs split.
2485          */
2486         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2487         if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2488                 struct ocfs2_extent_tree et;
2489
2490                 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2491                 *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2492                 *credits += ocfs2_calc_extend_credits(sb,
2493                                                       et.et_root_el);
2494         } else {
2495                 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2496                 *meta_add += 1;
2497         }
2498
2499 out:
2500
2501         trace_ocfs2_calc_refcount_meta_credits(
2502                 (unsigned long long)start_cpos, clusters,
2503                 *meta_add, *credits);
2504         brelse(ref_leaf_bh);
2505         brelse(prev_bh);
2506         return ret;
2507 }
2508
2509 /*
2510  * For refcount tree, we will decrease some contiguous clusters
2511  * refcount count, so just go through it to see how many blocks
2512  * we gonna touch and whether we need to create new blocks.
2513  *
2514  * Normally the refcount blocks store these refcount should be
2515  * contiguous also, so that we can get the number easily.
2516  * We will at most add split 2 refcount records and 2 more
2517  * refcount blocks, so just check it in a rough way.
2518  *
2519  * Caller must hold refcount tree lock.
2520  */
2521 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2522                                           u64 refcount_loc,
2523                                           u64 phys_blkno,
2524                                           u32 clusters,
2525                                           int *credits,
2526                                           int *ref_blocks)
2527 {
2528         int ret;
2529         struct buffer_head *ref_root_bh = NULL;
2530         struct ocfs2_refcount_tree *tree;
2531         u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2532
2533         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2534                 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2535                                   inode->i_ino);
2536                 goto out;
2537         }
2538
2539         BUG_ON(!ocfs2_is_refcount_inode(inode));
2540
2541         ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2542                                       refcount_loc, &tree);
2543         if (ret) {
2544                 mlog_errno(ret);
2545                 goto out;
2546         }
2547
2548         ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2549                                         &ref_root_bh);
2550         if (ret) {
2551                 mlog_errno(ret);
2552                 goto out;
2553         }
2554
2555         ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2556                                                &tree->rf_ci,
2557                                                ref_root_bh,
2558                                                start_cpos, clusters,
2559                                                ref_blocks, credits);
2560         if (ret) {
2561                 mlog_errno(ret);
2562                 goto out;
2563         }
2564
2565         trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2566
2567 out:
2568         brelse(ref_root_bh);
2569         return ret;
2570 }
2571
/* CoW tries to break extents up on boundaries of this many bytes. */
#define MAX_CONTIG_BYTES        1048576

/* MAX_CONTIG_BYTES expressed in clusters for the volume behind 'sb'. */
static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
{
        unsigned int nr_clusters;

        nr_clusters = ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
        return nr_clusters;
}
2578
/*
 * Bit mask built as ~(contig_clusters - 1), used to round cluster counts
 * to a multiple of contig_clusters.
 */
static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
{
        unsigned int contig = ocfs2_cow_contig_clusters(sb);

        return ~(contig - 1);
}
2583
/*
 * For an extent beginning at 'start' and an I/O beginning at 'cpos',
 * return the offset of the form start + (n * contig_clusters) that lies
 * closest to cpos without going past it.
 *
 * The goal is to break the extent at a multiple of contig_clusters.
 * It is a bug to call this with start > cpos.
 */
static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
                                                 unsigned int start,
                                                 unsigned int cpos)
{
        unsigned int aligned_delta;

        BUG_ON(start > cpos);

        /* distance from start to cpos, rounded down with the contig mask */
        aligned_delta = (cpos - start) & ocfs2_cow_contig_mask(sb);

        return start + aligned_delta;
}
2599
/*
 * Pad a cluster count of 'len' so that it becomes a multiple of
 * contig_clusters.  If the padding wraps around, UINT_MAX is returned
 * instead.
 */
static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
                                                  unsigned int len)
{
        unsigned int contig = ocfs2_cow_contig_clusters(sb);
        unsigned int rounded = (len + (contig - 1)) & ~(contig - 1);

        /* A result smaller than the input means the addition wrapped. */
        return rounded < len ? UINT_MAX : rounded;
}
2617
2618 /*
2619  * Calculate out the start and number of virtual clusters we need to to CoW.
2620  *
2621  * cpos is vitual start cluster position we want to do CoW in a
2622  * file and write_len is the cluster length.
2623  * max_cpos is the place where we want to stop CoW intentionally.
2624  *
2625  * Normal we will start CoW from the beginning of extent record cotaining cpos.
2626  * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2627  * get good I/O from the resulting extent tree.
2628  */
2629 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2630                                            struct ocfs2_extent_list *el,
2631                                            u32 cpos,
2632                                            u32 write_len,
2633                                            u32 max_cpos,
2634                                            u32 *cow_start,
2635                                            u32 *cow_len)
2636 {
2637         int ret = 0;
2638         int tree_height = le16_to_cpu(el->l_tree_depth), i;
2639         struct buffer_head *eb_bh = NULL;
2640         struct ocfs2_extent_block *eb = NULL;
2641         struct ocfs2_extent_rec *rec;
2642         unsigned int want_clusters, rec_end = 0;
2643         int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2644         int leaf_clusters;
2645
2646         BUG_ON(cpos + write_len > max_cpos);
2647
2648         if (tree_height > 0) {
2649                 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2650                 if (ret) {
2651                         mlog_errno(ret);
2652                         goto out;
2653                 }
2654
2655                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2656                 el = &eb->h_list;
2657
2658                 if (el->l_tree_depth) {
2659                         ret = ocfs2_error(inode->i_sb,
2660                                           "Inode %lu has non zero tree depth in leaf block %llu\n",
2661                                           inode->i_ino,
2662                                           (unsigned long long)eb_bh->b_blocknr);
2663                         goto out;
2664                 }
2665         }
2666
2667         *cow_len = 0;
2668         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2669                 rec = &el->l_recs[i];
2670
2671                 if (ocfs2_is_empty_extent(rec)) {
2672                         mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2673                                         "index %d\n", inode->i_ino, i);
2674                         continue;
2675                 }
2676
2677                 if (le32_to_cpu(rec->e_cpos) +
2678                     le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2679                         continue;
2680
2681                 if (*cow_len == 0) {
2682                         /*
2683                          * We should find a refcounted record in the
2684                          * first pass.
2685                          */
2686                         BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2687                         *cow_start = le32_to_cpu(rec->e_cpos);
2688                 }
2689
2690                 /*
2691                  * If we encounter a hole, a non-refcounted record or
2692                  * pass the max_cpos, stop the search.
2693                  */
2694                 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2695                     (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2696                     (max_cpos <= le32_to_cpu(rec->e_cpos)))
2697                         break;
2698
2699                 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2700                 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2701                 if (rec_end > max_cpos) {
2702                         rec_end = max_cpos;
2703                         leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2704                 }
2705
2706                 /*
2707                  * How many clusters do we actually need from
2708                  * this extent?  First we see how many we actually
2709                  * need to complete the write.  If that's smaller
2710                  * than contig_clusters, we try for contig_clusters.
2711                  */
2712                 if (!*cow_len)
2713                         want_clusters = write_len;
2714                 else
2715                         want_clusters = (cpos + write_len) -
2716                                 (*cow_start + *cow_len);
2717                 if (want_clusters < contig_clusters)
2718                         want_clusters = contig_clusters;
2719
2720                 /*
2721                  * If the write does not cover the whole extent, we
2722                  * need to calculate how we're going to split the extent.
2723                  * We try to do it on contig_clusters boundaries.
2724                  *
2725                  * Any extent smaller than contig_clusters will be
2726                  * CoWed in its entirety.
2727                  */
2728                 if (leaf_clusters <= contig_clusters)
2729                         *cow_len += leaf_clusters;
2730                 else if (*cow_len || (*cow_start == cpos)) {
2731                         /*
2732                          * This extent needs to be CoW'd from its
2733                          * beginning, so all we have to do is compute
2734                          * how many clusters to grab.  We align
2735                          * want_clusters to the edge of contig_clusters
2736                          * to get better I/O.
2737                          */
2738                         want_clusters = ocfs2_cow_align_length(inode->i_sb,
2739                                                                want_clusters);
2740
2741                         if (leaf_clusters < want_clusters)
2742                                 *cow_len += leaf_clusters;
2743                         else
2744                                 *cow_len += want_clusters;
2745                 } else if ((*cow_start + contig_clusters) >=
2746                            (cpos + write_len)) {
2747                         /*
2748                          * Breaking off contig_clusters at the front
2749                          * of the extent will cover our write.  That's
2750                          * easy.
2751                          */
2752                         *cow_len = contig_clusters;
2753                 } else if ((rec_end - cpos) <= contig_clusters) {
2754                         /*
2755                          * Breaking off contig_clusters at the tail of
2756                          * this extent will cover cpos.
2757                          */
2758                         *cow_start = rec_end - contig_clusters;
2759                         *cow_len = contig_clusters;
2760                 } else if ((rec_end - cpos) <= want_clusters) {
2761                         /*
2762                          * While we can't fit the entire write in this
2763                          * extent, we know that the write goes from cpos
2764                          * to the end of the extent.  Break that off.
2765                          * We try to break it at some multiple of
2766                          * contig_clusters from the front of the extent.
2767                          * Failing that (ie, cpos is within
2768                          * contig_clusters of the front), we'll CoW the
2769                          * entire extent.
2770                          */
2771                         *cow_start = ocfs2_cow_align_start(inode->i_sb,
2772                                                            *cow_start, cpos);
2773                         *cow_len = rec_end - *cow_start;
2774                 } else {
2775                         /*
2776                          * Ok, the entire write lives in the middle of
2777                          * this extent.  Let's try to slice the extent up
2778                          * nicely.  Optimally, our CoW region starts at
2779                          * m*contig_clusters from the beginning of the
2780                          * extent and goes for n*contig_clusters,
2781                          * covering the entire write.
2782                          */
2783                         *cow_start = ocfs2_cow_align_start(inode->i_sb,
2784                                                            *cow_start, cpos);
2785
2786                         want_clusters = (cpos + write_len) - *cow_start;
2787                         want_clusters = ocfs2_cow_align_length(inode->i_sb,
2788                                                                want_clusters);
2789                         if (*cow_start + want_clusters <= rec_end)
2790                                 *cow_len = want_clusters;
2791                         else
2792                                 *cow_len = rec_end - *cow_start;
2793                 }
2794
2795                 /* Have we covered our entire write yet? */
2796                 if ((*cow_start + *cow_len) >= (cpos + write_len))
2797                         break;
2798
2799                 /*
2800                  * If we reach the end of the extent block and don't get enough
2801                  * clusters, continue with the next extent block if possible.
2802                  */
2803                 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2804                     eb && eb->h_next_leaf_blk) {
2805                         brelse(eb_bh);
2806                         eb_bh = NULL;
2807
2808                         ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2809                                                le64_to_cpu(eb->h_next_leaf_blk),
2810                                                &eb_bh);
2811                         if (ret) {
2812                                 mlog_errno(ret);
2813                                 goto out;
2814                         }
2815
2816                         eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2817                         el = &eb->h_list;
2818                         i = -1;
2819                 }
2820         }
2821
2822 out:
2823         brelse(eb_bh);
2824         return ret;
2825 }
2826
2827 /*
2828  * Prepare meta_ac, data_ac and calculate credits when we want to add some
2829  * num_clusters in data_tree "et" and change the refcount for the old
2830  * clusters(starting form p_cluster) in the refcount tree.
2831  *
2832  * Note:
2833  * 1. since we may split the old tree, so we at most will need num_clusters + 2
2834  *    more new leaf records.
2835  * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2836  *    just give data_ac = NULL.
2837  */
2838 static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2839                                         u32 p_cluster, u32 num_clusters,
2840                                         struct ocfs2_extent_tree *et,
2841                                         struct ocfs2_caching_info *ref_ci,
2842                                         struct buffer_head *ref_root_bh,
2843                                         struct ocfs2_alloc_context **meta_ac,
2844                                         struct ocfs2_alloc_context **data_ac,
2845                                         int *credits)
2846 {
2847         int ret = 0, meta_add = 0;
2848         int num_free_extents = ocfs2_num_free_extents(et);
2849
2850         if (num_free_extents < 0) {
2851                 ret = num_free_extents;
2852                 mlog_errno(ret);
2853                 goto out;
2854         }
2855
2856         if (num_free_extents < num_clusters + 2)
2857                 meta_add =
2858                         ocfs2_extend_meta_needed(et->et_root_el);
2859
2860         *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
2861
2862         ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2863                                                p_cluster, num_clusters,
2864                                                &meta_add, credits);
2865         if (ret) {
2866                 mlog_errno(ret);
2867                 goto out;
2868         }
2869
2870         trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2871         ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2872                                                 meta_ac);
2873         if (ret) {
2874                 mlog_errno(ret);
2875                 goto out;
2876         }
2877
2878         if (data_ac) {
2879                 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2880                                              data_ac);
2881                 if (ret)
2882                         mlog_errno(ret);
2883         }
2884
2885 out:
2886         if (ret) {
2887                 if (*meta_ac) {
2888                         ocfs2_free_alloc_context(*meta_ac);
2889                         *meta_ac = NULL;
2890                 }
2891         }
2892
2893         return ret;
2894 }
2895
2896 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2897 {
2898         BUG_ON(buffer_dirty(bh));
2899
2900         clear_buffer_mapped(bh);
2901
2902         return 0;
2903 }
2904
2905 int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2906                                      struct inode *inode,
2907                                      u32 cpos, u32 old_cluster,
2908                                      u32 new_cluster, u32 new_len)
2909 {
2910         int ret = 0, partial;
2911         struct super_block *sb = inode->i_sb;
2912         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2913         struct page *page;
2914         pgoff_t page_index;
2915         unsigned int from, to;
2916         loff_t offset, end, map_end;
2917         struct address_space *mapping = inode->i_mapping;
2918
2919         trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2920                                                new_cluster, new_len);
2921
2922         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2923         end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2924         /*
2925          * We only duplicate pages until we reach the page contains i_size - 1.
2926          * So trim 'end' to i_size.
2927          */
2928         if (end > i_size_read(inode))
2929                 end = i_size_read(inode);
2930
2931         while (offset < end) {
2932                 page_index = offset >> PAGE_SHIFT;
2933                 map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
2934                 if (map_end > end)
2935                         map_end = end;
2936
2937                 /* from, to is the offset within the page. */
2938                 from = offset & (PAGE_SIZE - 1);
2939                 to = PAGE_SIZE;
2940                 if (map_end & (PAGE_SIZE - 1))
2941                         to = map_end & (PAGE_SIZE - 1);
2942
2943 retry:
2944                 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2945                 if (!page) {
2946                         ret = -ENOMEM;
2947                         mlog_errno(ret);
2948                         break;
2949                 }
2950
2951                 /*
2952                  * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
2953                  * page, so write it back.
2954                  */
2955                 if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
2956                         if (PageDirty(page)) {
2957                                 /*
2958                                  * write_on_page will unlock the page on return
2959                                  */
2960                                 ret = write_one_page(page);
2961                                 goto retry;
2962                         }
2963                 }
2964
2965                 if (!PageUptodate(page)) {
2966                         ret = block_read_full_page(page, ocfs2_get_block);
2967                         if (ret) {
2968                                 mlog_errno(ret);
2969                                 goto unlock;
2970                         }
2971                         lock_page(page);
2972                 }
2973
2974                 if (page_has_buffers(page)) {
2975                         ret = walk_page_buffers(handle, page_buffers(page),
2976                                                 from, to, &partial,
2977                                                 ocfs2_clear_cow_buffer);
2978                         if (ret) {
2979                                 mlog_errno(ret);
2980                                 goto unlock;
2981                         }
2982                 }
2983
2984                 ocfs2_map_and_dirty_page(inode,
2985                                          handle, from, to,
2986                                          page, 0, &new_block);
2987                 mark_page_accessed(page);
2988 unlock:
2989                 unlock_page(page);
2990                 put_page(page);
2991                 page = NULL;
2992                 offset = map_end;
2993                 if (ret)
2994                         break;
2995         }
2996
2997         return ret;
2998 }
2999
3000 int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3001                                     struct inode *inode,
3002                                     u32 cpos, u32 old_cluster,
3003                                     u32 new_cluster, u32 new_len)
3004 {
3005         int ret = 0;
3006         struct super_block *sb = inode->i_sb;
3007         struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3008         int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3009         u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3010         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3011         struct ocfs2_super *osb = OCFS2_SB(sb);
3012         struct buffer_head *old_bh = NULL;
3013         struct buffer_head *new_bh = NULL;
3014
3015         trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3016                                                new_cluster, new_len);
3017
3018         for (i = 0; i < blocks; i++, old_block++, new_block++) {
3019                 new_bh = sb_getblk(osb->sb, new_block);
3020                 if (new_bh == NULL) {
3021                         ret = -ENOMEM;
3022                         mlog_errno(ret);
3023                         break;
3024                 }
3025
3026                 ocfs2_set_new_buffer_uptodate(ci, new_bh);
3027
3028                 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3029                 if (ret) {
3030                         mlog_errno(ret);
3031                         break;
3032                 }
3033
3034                 ret = ocfs2_journal_access(handle, ci, new_bh,
3035                                            OCFS2_JOURNAL_ACCESS_CREATE);
3036                 if (ret) {
3037                         mlog_errno(ret);
3038                         break;
3039                 }
3040
3041                 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3042                 ocfs2_journal_dirty(handle, new_bh);
3043
3044                 brelse(new_bh);
3045                 brelse(old_bh);
3046                 new_bh = NULL;
3047                 old_bh = NULL;
3048         }
3049
3050         brelse(new_bh);
3051         brelse(old_bh);
3052         return ret;
3053 }
3054
3055 static int ocfs2_clear_ext_refcount(handle_t *handle,
3056                                     struct ocfs2_extent_tree *et,
3057                                     u32 cpos, u32 p_cluster, u32 len,
3058                                     unsigned int ext_flags,
3059                                     struct ocfs2_alloc_context *meta_ac,
3060                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
3061 {
3062         int ret, index;
3063         struct ocfs2_extent_rec replace_rec;
3064         struct ocfs2_path *path = NULL;
3065         struct ocfs2_extent_list *el;
3066         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3067         u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3068
3069         trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3070                                        cpos, len, p_cluster, ext_flags);
3071
3072         memset(&replace_rec, 0, sizeof(replace_rec));
3073         replace_rec.e_cpos = cpu_to_le32(cpos);
3074         replace_rec.e_leaf_clusters = cpu_to_le16(len);
3075         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3076                                                                    p_cluster));
3077         replace_rec.e_flags = ext_flags;
3078         replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3079
3080         path = ocfs2_new_path_from_et(et);
3081         if (!path) {
3082                 ret = -ENOMEM;
3083                 mlog_errno(ret);
3084                 goto out;
3085         }
3086
3087         ret = ocfs2_find_path(et->et_ci, path, cpos);
3088         if (ret) {
3089                 mlog_errno(ret);
3090                 goto out;
3091         }
3092
3093         el = path_leaf_el(path);
3094
3095         index = ocfs2_search_extent_list(el, cpos);
3096         if (index == -1) {
3097                 ret = ocfs2_error(sb,
3098                                   "Inode %llu has an extent at cpos %u which can no longer be found\n",
3099                                   (unsigned long long)ino, cpos);
3100                 goto out;
3101         }
3102
3103         ret = ocfs2_split_extent(handle, et, path, index,
3104                                  &replace_rec, meta_ac, dealloc);
3105         if (ret)
3106                 mlog_errno(ret);
3107
3108 out:
3109         ocfs2_free_path(path);
3110         return ret;
3111 }
3112
3113 static int ocfs2_replace_clusters(handle_t *handle,
3114                                   struct ocfs2_cow_context *context,
3115                                   u32 cpos, u32 old,
3116                                   u32 new, u32 len,
3117                                   unsigned int ext_flags)
3118 {
3119         int ret;
3120         struct ocfs2_caching_info *ci = context->data_et.et_ci;
3121         u64 ino = ocfs2_metadata_cache_owner(ci);
3122
3123         trace_ocfs2_replace_clusters((unsigned long long)ino,
3124                                      cpos, old, new, len, ext_flags);
3125
3126         /*If the old clusters is unwritten, no need to duplicate. */
3127         if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3128                 ret = context->cow_duplicate_clusters(handle, context->inode,
3129                                                       cpos, old, new, len);
3130                 if (ret) {
3131                         mlog_errno(ret);
3132                         goto out;
3133                 }
3134         }
3135
3136         ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3137                                        cpos, new, len, ext_flags,
3138                                        context->meta_ac, &context->dealloc);
3139         if (ret)
3140                 mlog_errno(ret);
3141 out:
3142         return ret;
3143 }
3144
3145 int ocfs2_cow_sync_writeback(struct super_block *sb,
3146                              struct inode *inode,
3147                              u32 cpos, u32 num_clusters)
3148 {
3149         int ret = 0;
3150         loff_t offset, end, map_end;
3151         pgoff_t page_index;
3152         struct page *page;
3153
3154         if (ocfs2_should_order_data(inode))
3155                 return 0;
3156
3157         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3158         end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3159
3160         ret = filemap_fdatawrite_range(inode->i_mapping,
3161                                        offset, end - 1);
3162         if (ret < 0) {
3163                 mlog_errno(ret);
3164                 return ret;
3165         }
3166
3167         while (offset < end) {
3168                 page_index = offset >> PAGE_SHIFT;
3169                 map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
3170                 if (map_end > end)
3171                         map_end = end;
3172
3173                 page = find_or_create_page(inode->i_mapping,
3174                                            page_index, GFP_NOFS);
3175                 BUG_ON(!page);
3176
3177                 wait_on_page_writeback(page);
3178                 if (PageError(page)) {
3179                         ret = -EIO;
3180                         mlog_errno(ret);
3181                 } else
3182                         mark_page_accessed(page);
3183
3184                 unlock_page(page);
3185                 put_page(page);
3186                 page = NULL;
3187                 offset = map_end;
3188                 if (ret)
3189                         break;
3190         }
3191
3192         return ret;
3193 }
3194
3195 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3196                                  u32 v_cluster, u32 *p_cluster,
3197                                  u32 *num_clusters,
3198                                  unsigned int *extent_flags)
3199 {
3200         return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3201                                   num_clusters, extent_flags);
3202 }
3203
3204 static int ocfs2_make_clusters_writable(struct super_block *sb,
3205                                         struct ocfs2_cow_context *context,
3206                                         u32 cpos, u32 p_cluster,
3207                                         u32 num_clusters, unsigned int e_flags)
3208 {
3209         int ret, delete, index, credits =  0;
3210         u32 new_bit, new_len, orig_num_clusters;
3211         unsigned int set_len;
3212         struct ocfs2_super *osb = OCFS2_SB(sb);
3213         handle_t *handle;
3214         struct buffer_head *ref_leaf_bh = NULL;
3215         struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3216         struct ocfs2_refcount_rec rec;
3217
3218         trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3219                                            num_clusters, e_flags);
3220
3221         ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3222                                              &context->data_et,
3223                                              ref_ci,
3224                                              context->ref_root_bh,
3225                                              &context->meta_ac,
3226                                              &context->data_ac, &credits);
3227         if (ret) {
3228                 mlog_errno(ret);
3229                 return ret;
3230         }
3231
3232         if (context->post_refcount)
3233                 credits += context->post_refcount->credits;
3234
3235         credits += context->extra_credits;
3236         handle = ocfs2_start_trans(osb, credits);
3237         if (IS_ERR(handle)) {
3238                 ret = PTR_ERR(handle);
3239                 mlog_errno(ret);
3240                 goto out;
3241         }
3242
3243         orig_num_clusters = num_clusters;
3244
3245         while (num_clusters) {
3246                 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3247                                              p_cluster, num_clusters,
3248                                              &rec, &index, &ref_leaf_bh);
3249                 if (ret) {
3250                         mlog_errno(ret);
3251                         goto out_commit;
3252                 }
3253
3254                 BUG_ON(!rec.r_refcount);
3255                 set_len = min((u64)p_cluster + num_clusters,
3256                               le64_to_cpu(rec.r_cpos) +
3257                               le32_to_cpu(rec.r_clusters)) - p_cluster;
3258
3259                 /*
3260                  * There are many different situation here.
3261                  * 1. If refcount == 1, remove the flag and don't COW.
3262                  * 2. If refcount > 1, allocate clusters.
3263                  *    Here we may not allocate r_len once at a time, so continue
3264                  *    until we reach num_clusters.
3265                  */
3266                 if (le32_to_cpu(rec.r_refcount) == 1) {
3267                         delete = 0;
3268                         ret = ocfs2_clear_ext_refcount(handle,
3269                                                        &context->data_et,
3270                                                        cpos, p_cluster,
3271                                                        set_len, e_flags,
3272                                                        context->meta_ac,
3273                                                        &context->dealloc);
3274                         if (ret) {
3275                                 mlog_errno(ret);
3276                                 goto out_commit;
3277                         }
3278                 } else {
3279                         delete = 1;
3280
3281                         ret = __ocfs2_claim_clusters(handle,
3282                                                      context->data_ac,
3283                                                      1, set_len,
3284                                                      &new_bit, &new_len);
3285                         if (ret) {
3286                                 mlog_errno(ret);
3287                                 goto out_commit;
3288                         }
3289
3290                         ret = ocfs2_replace_clusters(handle, context,
3291                                                      cpos, p_cluster, new_bit,
3292                                                      new_len, e_flags);
3293                         if (ret) {
3294                                 mlog_errno(ret);
3295                                 goto out_commit;
3296                         }
3297                         set_len = new_len;
3298                 }
3299
3300                 ret = __ocfs2_decrease_refcount(handle, ref_ci,
3301                                                 context->ref_root_bh,
3302                                                 p_cluster, set_len,
3303                                                 context->meta_ac,
3304                                                 &context->dealloc, delete);
3305                 if (ret) {
3306                         mlog_errno(ret);
3307                         goto out_commit;
3308                 }
3309
3310                 cpos += set_len;
3311                 p_cluster += set_len;
3312                 num_clusters -= set_len;
3313                 brelse(ref_leaf_bh);
3314                 ref_leaf_bh = NULL;
3315         }
3316
3317         /* handle any post_cow action. */
3318         if (context->post_refcount && context->post_refcount->func) {
3319                 ret = context->post_refcount->func(context->inode, handle,
3320                                                 context->post_refcount->para);
3321                 if (ret) {
3322                         mlog_errno(ret);
3323                         goto out_commit;
3324                 }
3325         }
3326
3327         /*
3328          * Here we should write the new page out first if we are
3329          * in write-back mode.
3330          */
3331         if (context->get_clusters == ocfs2_di_get_clusters) {
3332                 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3333                                                orig_num_clusters);
3334                 if (ret)
3335                         mlog_errno(ret);
3336         }
3337
3338 out_commit:
3339         ocfs2_commit_trans(osb, handle);
3340
3341 out:
3342         if (context->data_ac) {
3343                 ocfs2_free_alloc_context(context->data_ac);
3344                 context->data_ac = NULL;
3345         }
3346         if (context->meta_ac) {
3347                 ocfs2_free_alloc_context(context->meta_ac);
3348                 context->meta_ac = NULL;
3349         }
3350         brelse(ref_leaf_bh);
3351
3352         return ret;
3353 }
3354
3355 static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3356 {
3357         int ret = 0;
3358         struct inode *inode = context->inode;
3359         u32 cow_start = context->cow_start, cow_len = context->cow_len;
3360         u32 p_cluster, num_clusters;
3361         unsigned int ext_flags;
3362         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3363
3364         if (!ocfs2_refcount_tree(osb)) {
3365                 return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
3366                                    inode->i_ino);
3367         }
3368
3369         ocfs2_init_dealloc_ctxt(&context->dealloc);
3370
3371         while (cow_len) {
3372                 ret = context->get_clusters(context, cow_start, &p_cluster,
3373                                             &num_clusters, &ext_flags);
3374                 if (ret) {
3375                         mlog_errno(ret);
3376                         break;
3377                 }
3378
3379                 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3380
3381                 if (cow_len < num_clusters)
3382                         num_clusters = cow_len;
3383
3384                 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3385                                                    cow_start, p_cluster,
3386                                                    num_clusters, ext_flags);
3387                 if (ret) {
3388                         mlog_errno(ret);
3389                         break;
3390                 }
3391
3392                 cow_len -= num_clusters;
3393                 cow_start += num_clusters;
3394         }
3395
3396         if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3397                 ocfs2_schedule_truncate_log_flush(osb, 1);
3398                 ocfs2_run_deallocs(osb, &context->dealloc);
3399         }
3400
3401         return ret;
3402 }
3403
3404 /*
3405  * Starting at cpos, try to CoW write_len clusters.  Don't CoW
3406  * past max_cpos.  This will stop when it runs into a hole or an
3407  * unrefcounted extent.
3408  */
3409 static int ocfs2_refcount_cow_hunk(struct inode *inode,
3410                                    struct buffer_head *di_bh,
3411                                    u32 cpos, u32 write_len, u32 max_cpos)
3412 {
3413         int ret;
3414         u32 cow_start = 0, cow_len = 0;
3415         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3416         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3417         struct buffer_head *ref_root_bh = NULL;
3418         struct ocfs2_refcount_tree *ref_tree;
3419         struct ocfs2_cow_context *context = NULL;
3420
3421         BUG_ON(!ocfs2_is_refcount_inode(inode));
3422
3423         ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3424                                               cpos, write_len, max_cpos,
3425                                               &cow_start, &cow_len);
3426         if (ret) {
3427                 mlog_errno(ret);
3428                 goto out;
3429         }
3430
3431         trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3432                                       cpos, write_len, max_cpos,
3433                                       cow_start, cow_len);
3434
3435         BUG_ON(cow_len == 0);
3436
3437         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3438         if (!context) {
3439                 ret = -ENOMEM;
3440                 mlog_errno(ret);
3441                 goto out;
3442         }
3443
3444         ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3445                                        1, &ref_tree, &ref_root_bh);
3446         if (ret) {
3447                 mlog_errno(ret);
3448                 goto out;
3449         }
3450
3451         context->inode = inode;
3452         context->cow_start = cow_start;
3453         context->cow_len = cow_len;
3454         context->ref_tree = ref_tree;
3455         context->ref_root_bh = ref_root_bh;
3456         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3457         context->get_clusters = ocfs2_di_get_clusters;
3458
3459         ocfs2_init_dinode_extent_tree(&context->data_et,
3460                                       INODE_CACHE(inode), di_bh);
3461
3462         ret = ocfs2_replace_cow(context);
3463         if (ret)
3464                 mlog_errno(ret);
3465
3466         /*
3467          * truncate the extent map here since no matter whether we meet with
3468          * any error during the action, we shouldn't trust cached extent map
3469          * any more.
3470          */
3471         ocfs2_extent_map_trunc(inode, cow_start);
3472
3473         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3474         brelse(ref_root_bh);
3475 out:
3476         kfree(context);
3477         return ret;
3478 }
3479
3480 /*
3481  * CoW any and all clusters between cpos and cpos+write_len.
3482  * Don't CoW past max_cpos.  If this returns successfully, all
3483  * clusters between cpos and cpos+write_len are safe to modify.
3484  */
3485 int ocfs2_refcount_cow(struct inode *inode,
3486                        struct buffer_head *di_bh,
3487                        u32 cpos, u32 write_len, u32 max_cpos)
3488 {
3489         int ret = 0;
3490         u32 p_cluster, num_clusters;
3491         unsigned int ext_flags;
3492
3493         while (write_len) {
3494                 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3495                                          &num_clusters, &ext_flags);
3496                 if (ret) {
3497                         mlog_errno(ret);
3498                         break;
3499                 }
3500
3501                 if (write_len < num_clusters)
3502                         num_clusters = write_len;
3503
3504                 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3505                         ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3506                                                       num_clusters, max_cpos);
3507                         if (ret) {
3508                                 mlog_errno(ret);
3509                                 break;
3510                         }
3511                 }
3512
3513                 write_len -= num_clusters;
3514                 cpos += num_clusters;
3515         }
3516
3517         return ret;
3518 }
3519
3520 static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3521                                           u32 v_cluster, u32 *p_cluster,
3522                                           u32 *num_clusters,
3523                                           unsigned int *extent_flags)
3524 {
3525         struct inode *inode = context->inode;
3526         struct ocfs2_xattr_value_root *xv = context->cow_object;
3527
3528         return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3529                                         num_clusters, &xv->xr_list,
3530                                         extent_flags);
3531 }
3532
3533 /*
3534  * Given a xattr value root, calculate the most meta/credits we need for
3535  * refcount tree change if we truncate it to 0.
3536  */
3537 int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3538                                        struct ocfs2_caching_info *ref_ci,
3539                                        struct buffer_head *ref_root_bh,
3540                                        struct ocfs2_xattr_value_root *xv,
3541                                        int *meta_add, int *credits)
3542 {
3543         int ret = 0, index, ref_blocks = 0;
3544         u32 p_cluster, num_clusters;
3545         u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3546         struct ocfs2_refcount_block *rb;
3547         struct ocfs2_refcount_rec rec;
3548         struct buffer_head *ref_leaf_bh = NULL;
3549
3550         while (cpos < clusters) {
3551                 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3552                                                &num_clusters, &xv->xr_list,
3553                                                NULL);
3554                 if (ret) {
3555                         mlog_errno(ret);
3556                         goto out;
3557                 }
3558
3559                 cpos += num_clusters;
3560
3561                 while (num_clusters) {
3562                         ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3563                                                      p_cluster, num_clusters,
3564                                                      &rec, &index,
3565                                                      &ref_leaf_bh);
3566                         if (ret) {
3567                                 mlog_errno(ret);
3568                                 goto out;
3569                         }
3570
3571                         BUG_ON(!rec.r_refcount);
3572
3573                         rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3574
3575                         /*
3576                          * We really don't know whether the other clusters is in
3577                          * this refcount block or not, so just take the worst
3578                          * case that all the clusters are in this block and each
3579                          * one will split a refcount rec, so totally we need
3580                          * clusters * 2 new refcount rec.
3581                          */
3582                         if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3583                             le16_to_cpu(rb->rf_records.rl_count))
3584                                 ref_blocks++;
3585
3586                         *credits += 1;
3587                         brelse(ref_leaf_bh);
3588                         ref_leaf_bh = NULL;
3589
3590                         if (num_clusters <= le32_to_cpu(rec.r_clusters))
3591                                 break;
3592                         else
3593                                 num_clusters -= le32_to_cpu(rec.r_clusters);
3594                         p_cluster += num_clusters;
3595                 }
3596         }
3597
3598         *meta_add += ref_blocks;
3599         if (!ref_blocks)
3600                 goto out;
3601
3602         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3603         if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3604                 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3605         else {
3606                 struct ocfs2_extent_tree et;
3607
3608                 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3609                 *credits += ocfs2_calc_extend_credits(inode->i_sb,
3610                                                       et.et_root_el);
3611         }
3612
3613 out:
3614         brelse(ref_leaf_bh);
3615         return ret;
3616 }
3617
3618 /*
3619  * Do CoW for xattr.
3620  */
3621 int ocfs2_refcount_cow_xattr(struct inode *inode,
3622                              struct ocfs2_dinode *di,
3623                              struct ocfs2_xattr_value_buf *vb,
3624                              struct ocfs2_refcount_tree *ref_tree,
3625                              struct buffer_head *ref_root_bh,
3626                              u32 cpos, u32 write_len,
3627                              struct ocfs2_post_refcount *post)
3628 {
3629         int ret;
3630         struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3631         struct ocfs2_cow_context *context = NULL;
3632         u32 cow_start, cow_len;
3633
3634         BUG_ON(!ocfs2_is_refcount_inode(inode));
3635
3636         ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3637                                               cpos, write_len, UINT_MAX,
3638                                               &cow_start, &cow_len);
3639         if (ret) {
3640                 mlog_errno(ret);
3641                 goto out;
3642         }
3643
3644         BUG_ON(cow_len == 0);
3645
3646         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3647         if (!context) {
3648                 ret = -ENOMEM;
3649                 mlog_errno(ret);
3650                 goto out;
3651         }
3652
3653         context->inode = inode;
3654         context->cow_start = cow_start;
3655         context->cow_len = cow_len;
3656         context->ref_tree = ref_tree;
3657         context->ref_root_bh = ref_root_bh;
3658         context->cow_object = xv;
3659
3660         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3661         /* We need the extra credits for duplicate_clusters by jbd. */
3662         context->extra_credits =
3663                 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3664         context->get_clusters = ocfs2_xattr_value_get_clusters;
3665         context->post_refcount = post;
3666
3667         ocfs2_init_xattr_value_extent_tree(&context->data_et,
3668                                            INODE_CACHE(inode), vb);
3669
3670         ret = ocfs2_replace_cow(context);
3671         if (ret)
3672                 mlog_errno(ret);
3673
3674 out:
3675         kfree(context);
3676         return ret;
3677 }
3678
3679 /*
3680  * Insert a new extent into refcount tree and mark a extent rec
3681  * as refcounted in the dinode tree.
3682  */
3683 int ocfs2_add_refcount_flag(struct inode *inode,
3684                             struct ocfs2_extent_tree *data_et,
3685                             struct ocfs2_caching_info *ref_ci,
3686                             struct buffer_head *ref_root_bh,
3687                             u32 cpos, u32 p_cluster, u32 num_clusters,
3688                             struct ocfs2_cached_dealloc_ctxt *dealloc,
3689                             struct ocfs2_post_refcount *post)
3690 {
3691         int ret;
3692         handle_t *handle;
3693         int credits = 1, ref_blocks = 0;
3694         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3695         struct ocfs2_alloc_context *meta_ac = NULL;
3696
3697         /* We need to be able to handle at least an extent tree split. */
3698         ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
3699
3700         ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3701                                                ref_ci, ref_root_bh,
3702                                                p_cluster, num_clusters,
3703                                                &ref_blocks, &credits);
3704         if (ret) {
3705                 mlog_errno(ret);
3706                 goto out;
3707         }
3708
3709         trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3710
3711         if (ref_blocks) {
3712                 ret = ocfs2_reserve_new_metadata_blocks(osb,
3713                                                         ref_blocks, &meta_ac);
3714                 if (ret) {
3715                         mlog_errno(ret);
3716                         goto out;
3717                 }
3718         }
3719
3720         if (post)
3721                 credits += post->credits;
3722
3723         handle = ocfs2_start_trans(osb, credits);
3724         if (IS_ERR(handle)) {
3725                 ret = PTR_ERR(handle);
3726                 mlog_errno(ret);
3727                 goto out;
3728         }
3729
3730         ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3731                                            cpos, num_clusters, p_cluster,
3732                                            meta_ac, dealloc);
3733         if (ret) {
3734                 mlog_errno(ret);
3735                 goto out_commit;
3736         }
3737
3738         ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3739                                         p_cluster, num_clusters, 0,
3740                                         meta_ac, dealloc);
3741         if (ret) {
3742                 mlog_errno(ret);
3743                 goto out_commit;
3744         }
3745
3746         if (post && post->func) {
3747                 ret = post->func(inode, handle, post->para);
3748                 if (ret)
3749                         mlog_errno(ret);
3750         }
3751
3752 out_commit:
3753         ocfs2_commit_trans(osb, handle);
3754 out:
3755         if (meta_ac)
3756                 ocfs2_free_alloc_context(meta_ac);
3757         return ret;
3758 }
3759
3760 static int ocfs2_change_ctime(struct inode *inode,
3761                               struct buffer_head *di_bh)
3762 {
3763         int ret;
3764         handle_t *handle;
3765         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3766
3767         handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3768                                    OCFS2_INODE_UPDATE_CREDITS);
3769         if (IS_ERR(handle)) {
3770                 ret = PTR_ERR(handle);
3771                 mlog_errno(ret);
3772                 goto out;
3773         }
3774
3775         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3776                                       OCFS2_JOURNAL_ACCESS_WRITE);
3777         if (ret) {
3778                 mlog_errno(ret);
3779                 goto out_commit;
3780         }
3781
3782         inode->i_ctime = current_time(inode);
3783         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3784         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3785
3786         ocfs2_journal_dirty(handle, di_bh);
3787
3788 out_commit:
3789         ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3790 out:
3791         return ret;
3792 }
3793
3794 static int ocfs2_attach_refcount_tree(struct inode *inode,
3795                                       struct buffer_head *di_bh)
3796 {
3797         int ret, data_changed = 0;
3798         struct buffer_head *ref_root_bh = NULL;
3799         struct ocfs2_inode_info *oi = OCFS2_I(inode);
3800         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3801         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3802         struct ocfs2_refcount_tree *ref_tree;
3803         unsigned int ext_flags;
3804         loff_t size;
3805         u32 cpos, num_clusters, clusters, p_cluster;
3806         struct ocfs2_cached_dealloc_ctxt dealloc;
3807         struct ocfs2_extent_tree di_et;
3808
3809         ocfs2_init_dealloc_ctxt(&dealloc);
3810
3811         if (!ocfs2_is_refcount_inode(inode)) {
3812                 ret = ocfs2_create_refcount_tree(inode, di_bh);
3813                 if (ret) {
3814                         mlog_errno(ret);
3815                         goto out;
3816                 }
3817         }
3818
3819         BUG_ON(!di->i_refcount_loc);
3820         ret = ocfs2_lock_refcount_tree(osb,
3821                                        le64_to_cpu(di->i_refcount_loc), 1,
3822                                        &ref_tree, &ref_root_bh);
3823         if (ret) {
3824                 mlog_errno(ret);
3825                 goto out;
3826         }
3827
3828         if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3829                 goto attach_xattr;
3830
3831         ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3832
3833         size = i_size_read(inode);
3834         clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3835
3836         cpos = 0;
3837         while (cpos < clusters) {
3838                 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3839                                          &num_clusters, &ext_flags);
3840                 if (ret) {
3841                         mlog_errno(ret);
3842                         goto unlock;
3843                 }
3844                 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3845                         ret = ocfs2_add_refcount_flag(inode, &di_et,
3846                                                       &ref_tree->rf_ci,
3847                                                       ref_root_bh, cpos,
3848                                                       p_cluster, num_clusters,
3849                                                       &dealloc, NULL);
3850                         if (ret) {
3851                                 mlog_errno(ret);
3852                                 goto unlock;
3853                         }
3854
3855                         data_changed = 1;
3856                 }
3857                 cpos += num_clusters;
3858         }
3859
3860 attach_xattr:
3861         if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3862                 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3863                                                        &ref_tree->rf_ci,
3864                                                        ref_root_bh,
3865                                                        &dealloc);
3866                 if (ret) {
3867                         mlog_errno(ret);
3868                         goto unlock;
3869                 }
3870         }
3871
3872         if (data_changed) {
3873                 ret = ocfs2_change_ctime(inode, di_bh);
3874                 if (ret)
3875                         mlog_errno(ret);
3876         }
3877
3878 unlock:
3879         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3880         brelse(ref_root_bh);
3881
3882         if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3883                 ocfs2_schedule_truncate_log_flush(osb, 1);
3884                 ocfs2_run_deallocs(osb, &dealloc);
3885         }
3886 out:
3887         /*
3888          * Empty the extent map so that we may get the right extent
3889          * record from the disk.
3890          */
3891         ocfs2_extent_map_trunc(inode, 0);
3892
3893         return ret;
3894 }
3895
3896 static int ocfs2_add_refcounted_extent(struct inode *inode,
3897                                    struct ocfs2_extent_tree *et,
3898                                    struct ocfs2_caching_info *ref_ci,
3899                                    struct buffer_head *ref_root_bh,
3900                                    u32 cpos, u32 p_cluster, u32 num_clusters,
3901                                    unsigned int ext_flags,
3902                                    struct ocfs2_cached_dealloc_ctxt *dealloc)
3903 {
3904         int ret;
3905         handle_t *handle;
3906         int credits = 0;
3907         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3908         struct ocfs2_alloc_context *meta_ac = NULL;
3909
3910         ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3911                                              p_cluster, num_clusters,
3912                                              et, ref_ci,
3913                                              ref_root_bh, &meta_ac,
3914                                              NULL, &credits);
3915         if (ret) {
3916                 mlog_errno(ret);
3917                 goto out;
3918         }
3919
3920         handle = ocfs2_start_trans(osb, credits);
3921         if (IS_ERR(handle)) {
3922                 ret = PTR_ERR(handle);
3923                 mlog_errno(ret);
3924                 goto out;
3925         }
3926
3927         ret = ocfs2_insert_extent(handle, et, cpos,
3928                         ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3929                         num_clusters, ext_flags, meta_ac);
3930         if (ret) {
3931                 mlog_errno(ret);
3932                 goto out_commit;
3933         }
3934
3935         ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3936                                       p_cluster, num_clusters,
3937                                       meta_ac, dealloc);
3938         if (ret) {
3939                 mlog_errno(ret);
3940                 goto out_commit;
3941         }
3942
3943         ret = dquot_alloc_space_nodirty(inode,
3944                 ocfs2_clusters_to_bytes(osb->sb, num_clusters));
3945         if (ret)
3946                 mlog_errno(ret);
3947
3948 out_commit:
3949         ocfs2_commit_trans(osb, handle);
3950 out:
3951         if (meta_ac)
3952                 ocfs2_free_alloc_context(meta_ac);
3953         return ret;
3954 }
3955
3956 static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3957                                        struct buffer_head *s_bh,
3958                                        struct inode *t_inode,
3959                                        struct buffer_head *t_bh)
3960 {
3961         int ret;
3962         handle_t *handle;
3963         struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3964         struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3965         struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
3966
3967         BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
3968
3969         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
3970         if (IS_ERR(handle)) {
3971                 ret = PTR_ERR(handle);
3972                 mlog_errno(ret);
3973                 goto out;
3974         }
3975
3976         ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3977                                       OCFS2_JOURNAL_ACCESS_WRITE);
3978         if (ret) {
3979                 mlog_errno(ret);
3980                 goto out_commit;
3981         }
3982
3983         t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
3984         memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
3985                le16_to_cpu(s_di->id2.i_data.id_count));
3986         spin_lock(&OCFS2_I(t_inode)->ip_lock);
3987         OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
3988         t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
3989         spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3990
3991         ocfs2_journal_dirty(handle, t_bh);
3992
3993 out_commit:
3994         ocfs2_commit_trans(osb, handle);
3995 out:
3996         return ret;
3997 }
3998
3999 static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4000                                 struct inode *t_inode,
4001                                 struct buffer_head *t_bh,
4002                                 struct ocfs2_caching_info *ref_ci,
4003                                 struct buffer_head *ref_root_bh,
4004                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
4005 {
4006         int ret = 0;
4007         u32 p_cluster, num_clusters, clusters, cpos;
4008         loff_t size;
4009         unsigned int ext_flags;
4010         struct ocfs2_extent_tree et;
4011
4012         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
4013
4014         size = i_size_read(s_inode);
4015         clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4016
4017         cpos = 0;
4018         while (cpos < clusters) {
4019                 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4020                                          &num_clusters, &ext_flags);
4021                 if (ret) {
4022                         mlog_errno(ret);
4023                         goto out;
4024                 }
4025                 if (p_cluster) {
4026                         ret = ocfs2_add_refcounted_extent(t_inode, &et,
4027                                                           ref_ci, ref_root_bh,
4028                                                           cpos, p_cluster,
4029                                                           num_clusters,
4030                                                           ext_flags,
4031                                                           dealloc);
4032                         if (ret) {
4033                                 mlog_errno(ret);
4034                                 goto out;
4035                         }
4036                 }
4037
4038                 cpos += num_clusters;
4039         }
4040
4041 out:
4042         return ret;
4043 }
4044
4045 /*
4046  * change the new file's attributes to the src.
4047  *
4048  * reflink creates a snapshot of a file, that means the attributes
4049  * must be identical except for three exceptions - nlink, ino, and ctime.
4050  */
4051 static int ocfs2_complete_reflink(struct inode *s_inode,
4052                                   struct buffer_head *s_bh,
4053                                   struct inode *t_inode,
4054                                   struct buffer_head *t_bh,
4055                                   bool preserve)
4056 {
4057         int ret;
4058         handle_t *handle;
4059         struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4060         struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4061         loff_t size = i_size_read(s_inode);
4062
4063         handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4064                                    OCFS2_INODE_UPDATE_CREDITS);
4065         if (IS_ERR(handle)) {
4066                 ret = PTR_ERR(handle);
4067                 mlog_errno(ret);
4068                 return ret;
4069         }
4070
4071         ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4072                                       OCFS2_JOURNAL_ACCESS_WRITE);
4073         if (ret) {
4074                 mlog_errno(ret);
4075                 goto out_commit;
4076         }
4077
4078         spin_lock(&OCFS2_I(t_inode)->ip_lock);
4079         OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4080         OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4081         OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4082         spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4083         i_size_write(t_inode, size);
4084         t_inode->i_blocks = s_inode->i_blocks;
4085
4086         di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4087         di->i_clusters = s_di->i_clusters;
4088         di->i_size = s_di->i_size;
4089         di->i_dyn_features = s_di->i_dyn_features;
4090         di->i_attr = s_di->i_attr;
4091
4092         if (preserve) {
4093                 t_inode->i_uid = s_inode->i_uid;
4094                 t_inode->i_gid = s_inode->i_gid;
4095                 t_inode->i_mode = s_inode->i_mode;
4096                 di->i_uid = s_di->i_uid;
4097                 di->i_gid = s_di->i_gid;
4098                 di->i_mode = s_di->i_mode;
4099
4100                 /*
4101                  * update time.
4102                  * we want mtime to appear identical to the source and
4103                  * update ctime.
4104                  */
4105                 t_inode->i_ctime = current_time(t_inode);
4106
4107                 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4108                 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4109
4110                 t_inode->i_mtime = s_inode->i_mtime;
4111                 di->i_mtime = s_di->i_mtime;
4112                 di->i_mtime_nsec = s_di->i_mtime_nsec;
4113         }
4114
4115         ocfs2_journal_dirty(handle, t_bh);
4116
4117 out_commit:
4118         ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4119         return ret;
4120 }
4121
4122 static int ocfs2_create_reflink_node(struct inode *s_inode,
4123                                      struct buffer_head *s_bh,
4124                                      struct inode *t_inode,
4125                                      struct buffer_head *t_bh,
4126                                      bool preserve)
4127 {
4128         int ret;
4129         struct buffer_head *ref_root_bh = NULL;
4130         struct ocfs2_cached_dealloc_ctxt dealloc;
4131         struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4132         struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4133         struct ocfs2_refcount_tree *ref_tree;
4134
4135         ocfs2_init_dealloc_ctxt(&dealloc);
4136
4137         ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4138                                       le64_to_cpu(di->i_refcount_loc));
4139         if (ret) {
4140                 mlog_errno(ret);
4141                 goto out;
4142         }
4143
4144         if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4145                 ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4146                                                   t_inode, t_bh);
4147                 if (ret)
4148                         mlog_errno(ret);
4149                 goto out;
4150         }
4151
4152         ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4153                                        1, &ref_tree, &ref_root_bh);
4154         if (ret) {
4155                 mlog_errno(ret);
4156                 goto out;
4157         }
4158
4159         ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4160                                           &ref_tree->rf_ci, ref_root_bh,
4161                                           &dealloc);
4162         if (ret) {
4163                 mlog_errno(ret);
4164                 goto out_unlock_refcount;
4165         }
4166
4167 out_unlock_refcount:
4168         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4169         brelse(ref_root_bh);
4170 out:
4171         if (ocfs2_dealloc_has_cluster(&dealloc)) {
4172                 ocfs2_schedule_truncate_log_flush(osb, 1);
4173                 ocfs2_run_deallocs(osb, &dealloc);
4174         }
4175
4176         return ret;
4177 }
4178
4179 static int __ocfs2_reflink(struct dentry *old_dentry,
4180                            struct buffer_head *old_bh,
4181                            struct inode *new_inode,
4182                            bool preserve)
4183 {
4184         int ret;
4185         struct inode *inode = d_inode(old_dentry);
4186         struct buffer_head *new_bh = NULL;
4187
4188         if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4189                 ret = -EINVAL;
4190                 mlog_errno(ret);
4191                 goto out;
4192         }
4193
4194         ret = filemap_fdatawrite(inode->i_mapping);
4195         if (ret) {
4196                 mlog_errno(ret);
4197                 goto out;
4198         }
4199
4200         ret = ocfs2_attach_refcount_tree(inode, old_bh);
4201         if (ret) {
4202                 mlog_errno(ret);
4203                 goto out;
4204         }
4205
4206         inode_lock_nested(new_inode, I_MUTEX_CHILD);
4207         ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4208                                       OI_LS_REFLINK_TARGET);
4209         if (ret) {
4210                 mlog_errno(ret);
4211                 goto out_unlock;
4212         }
4213
4214         ret = ocfs2_create_reflink_node(inode, old_bh,
4215                                         new_inode, new_bh, preserve);
4216         if (ret) {
4217                 mlog_errno(ret);
4218                 goto inode_unlock;
4219         }
4220
4221         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4222                 ret = ocfs2_reflink_xattrs(inode, old_bh,
4223                                            new_inode, new_bh,
4224                                            preserve);
4225                 if (ret) {
4226                         mlog_errno(ret);
4227                         goto inode_unlock;
4228                 }
4229         }
4230
4231         ret = ocfs2_complete_reflink(inode, old_bh,
4232                                      new_inode, new_bh, preserve);
4233         if (ret)
4234                 mlog_errno(ret);
4235
4236 inode_unlock:
4237         ocfs2_inode_unlock(new_inode, 1);
4238         brelse(new_bh);
4239 out_unlock:
4240         inode_unlock(new_inode);
4241 out:
4242         if (!ret) {
4243                 ret = filemap_fdatawait(inode->i_mapping);
4244                 if (ret)
4245                         mlog_errno(ret);
4246         }
4247         return ret;
4248 }
4249
4250 static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4251                          struct dentry *new_dentry, bool preserve)
4252 {
4253         int error, had_lock;
4254         struct inode *inode = d_inode(old_dentry);
4255         struct buffer_head *old_bh = NULL;
4256         struct inode *new_orphan_inode = NULL;
4257         struct ocfs2_lock_holder oh;
4258
4259         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4260                 return -EOPNOTSUPP;
4261
4262
4263         error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4264                                              &new_orphan_inode);
4265         if (error) {
4266                 mlog_errno(error);
4267                 goto out;
4268         }
4269
4270         error = ocfs2_rw_lock(inode, 1);
4271         if (error) {
4272                 mlog_errno(error);
4273                 goto out;
4274         }
4275
4276         error = ocfs2_inode_lock(inode, &old_bh, 1);
4277         if (error) {
4278                 mlog_errno(error);
4279                 ocfs2_rw_unlock(inode, 1);
4280                 goto out;
4281         }
4282
4283         down_write(&OCFS2_I(inode)->ip_xattr_sem);
4284         down_write(&OCFS2_I(inode)->ip_alloc_sem);
4285         error = __ocfs2_reflink(old_dentry, old_bh,
4286                                 new_orphan_inode, preserve);
4287         up_write(&OCFS2_I(inode)->ip_alloc_sem);
4288         up_write(&OCFS2_I(inode)->ip_xattr_sem);
4289
4290         ocfs2_inode_unlock(inode, 1);
4291         ocfs2_rw_unlock(inode, 1);
4292         brelse(old_bh);
4293
4294         if (error) {
4295                 mlog_errno(error);
4296                 goto out;
4297         }
4298
4299         had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
4300                                             &oh);
4301         if (had_lock < 0) {
4302                 error = had_lock;
4303                 mlog_errno(error);
4304                 goto out;
4305         }
4306
4307         /* If the security isn't preserved, we need to re-initialize them. */
4308         if (!preserve) {
4309                 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4310                                                     &new_dentry->d_name);
4311                 if (error)
4312                         mlog_errno(error);
4313         }
4314         if (!error) {
4315                 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4316                                                        new_dentry);
4317                 if (error)
4318                         mlog_errno(error);
4319         }
4320         ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
4321
4322 out:
4323         if (new_orphan_inode) {
4324                 /*
4325                  * We need to open_unlock the inode no matter whether we
4326                  * succeed or not, so that other nodes can delete it later.
4327                  */
4328                 ocfs2_open_unlock(new_orphan_inode);
4329                 if (error)
4330                         iput(new_orphan_inode);
4331         }
4332
4333         return error;
4334 }
4335
4336 /*
4337  * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4338  * sys_reflink().  This will go away when vfs_reflink() exists in
4339  * fs/namei.c.
4340  */
4341
4342 /* copied from may_create in VFS. */
4343 static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4344 {
4345         if (d_really_is_positive(child))
4346                 return -EEXIST;
4347         if (IS_DEADDIR(dir))
4348                 return -ENOENT;
4349         return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
4350 }
4351
4352 /**
4353  * ocfs2_vfs_reflink - Create a reference-counted link
4354  *
4355  * @old_dentry:        source dentry + inode
4356  * @dir:       directory to create the target
4357  * @new_dentry:        target dentry
4358  * @preserve:  if true, preserve all file attributes
4359  */
4360 static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4361                              struct dentry *new_dentry, bool preserve)
4362 {
4363         struct inode *inode = d_inode(old_dentry);
4364         int error;
4365
4366         if (!inode)
4367                 return -ENOENT;
4368
4369         error = ocfs2_may_create(dir, new_dentry);
4370         if (error)
4371                 return error;
4372
4373         if (dir->i_sb != inode->i_sb)
4374                 return -EXDEV;
4375
4376         /*
4377          * A reflink to an append-only or immutable file cannot be created.
4378          */
4379         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4380                 return -EPERM;
4381
4382         /* Only regular files can be reflinked. */
4383         if (!S_ISREG(inode->i_mode))
4384                 return -EPERM;
4385
4386         /*
4387          * If the caller wants to preserve ownership, they require the
4388          * rights to do so.
4389          */
4390         if (preserve) {
4391                 if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
4392                         return -EPERM;
4393                 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4394                         return -EPERM;
4395         }
4396
4397         /*
4398          * If the caller is modifying any aspect of the attributes, they
4399          * are not creating a snapshot.  They need read permission on the
4400          * file.
4401          */
4402         if (!preserve) {
4403                 error = inode_permission(&init_user_ns, inode, MAY_READ);
4404                 if (error)
4405                         return error;
4406         }
4407
4408         inode_lock(inode);
4409         error = dquot_initialize(dir);
4410         if (!error)
4411                 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4412         inode_unlock(inode);
4413         if (!error)
4414                 fsnotify_create(dir, new_dentry);
4415         return error;
4416 }
4417 /*
4418  * Most codes are copied from sys_linkat.
4419  */
4420 int ocfs2_reflink_ioctl(struct inode *inode,
4421                         const char __user *oldname,
4422                         const char __user *newname,
4423                         bool preserve)
4424 {
4425         struct dentry *new_dentry;
4426         struct path old_path, new_path;
4427         int error;
4428
4429         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4430                 return -EOPNOTSUPP;
4431
4432         error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4433         if (error) {
4434                 mlog_errno(error);
4435                 return error;
4436         }
4437
4438         new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
4439         error = PTR_ERR(new_dentry);
4440         if (IS_ERR(new_dentry)) {
4441                 mlog_errno(error);
4442                 goto out;
4443         }
4444
4445         error = -EXDEV;
4446         if (old_path.mnt != new_path.mnt) {
4447                 mlog_errno(error);
4448                 goto out_dput;
4449         }
4450
4451         error = ocfs2_vfs_reflink(old_path.dentry,
4452                                   d_inode(new_path.dentry),
4453                                   new_dentry, preserve);
4454 out_dput:
4455         done_path_create(&new_path, new_dentry);
4456 out:
4457         path_put(&old_path);
4458
4459         return error;
4460 }
4461
4462 /* Update destination inode size, if necessary. */
4463 int ocfs2_reflink_update_dest(struct inode *dest,
4464                               struct buffer_head *d_bh,
4465                               loff_t newlen)
4466 {
4467         handle_t *handle;
4468         int ret;
4469
4470         dest->i_blocks = ocfs2_inode_sector_count(dest);
4471
4472         if (newlen <= i_size_read(dest))
4473                 return 0;
4474
4475         handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
4476                                    OCFS2_INODE_UPDATE_CREDITS);
4477         if (IS_ERR(handle)) {
4478                 ret = PTR_ERR(handle);
4479                 mlog_errno(ret);
4480                 return ret;
4481         }
4482
4483         /* Extend i_size if needed. */
4484         spin_lock(&OCFS2_I(dest)->ip_lock);
4485         if (newlen > i_size_read(dest))
4486                 i_size_write(dest, newlen);
4487         spin_unlock(&OCFS2_I(dest)->ip_lock);
4488         dest->i_ctime = dest->i_mtime = current_time(dest);
4489
4490         ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
4491         if (ret) {
4492                 mlog_errno(ret);
4493                 goto out_commit;
4494         }
4495
4496 out_commit:
4497         ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
4498         return ret;
4499 }
4500
4501 /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
4502 static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
4503                                          struct buffer_head *s_bh,
4504                                          loff_t pos_in,
4505                                          struct inode *t_inode,
4506                                          struct buffer_head *t_bh,
4507                                          loff_t pos_out,
4508                                          loff_t len,
4509                                          struct ocfs2_cached_dealloc_ctxt *dealloc)
4510 {
4511         struct ocfs2_extent_tree s_et;
4512         struct ocfs2_extent_tree t_et;
4513         struct ocfs2_dinode *dis;
4514         struct buffer_head *ref_root_bh = NULL;
4515         struct ocfs2_refcount_tree *ref_tree;
4516         struct ocfs2_super *osb;
4517         loff_t remapped_bytes = 0;
4518         loff_t pstart, plen;
4519         u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
4520         unsigned int ext_flags;
4521         int ret = 0;
4522
4523         osb = OCFS2_SB(s_inode->i_sb);
4524         dis = (struct ocfs2_dinode *)s_bh->b_data;
4525         ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
4526         ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
4527
4528         spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
4529         tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
4530         slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
4531
4532         while (spos < slast) {
4533                 if (fatal_signal_pending(current)) {
4534                         ret = -EINTR;
4535                         goto out;
4536                 }
4537
4538                 /* Look up the extent. */
4539                 ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
4540                                          &num_clusters, &ext_flags);
4541                 if (ret) {
4542                         mlog_errno(ret);
4543                         goto out;
4544                 }
4545
4546                 num_clusters = min_t(u32, num_clusters, slast - spos);
4547
4548                 /* Punch out the dest range. */
4549                 pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
4550                 plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
4551                 ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
4552                 if (ret) {
4553                         mlog_errno(ret);
4554                         goto out;
4555                 }
4556
4557                 if (p_cluster == 0)
4558                         goto next_loop;
4559
4560                 /* Lock the refcount btree... */
4561                 ret = ocfs2_lock_refcount_tree(osb,
4562                                                le64_to_cpu(dis->i_refcount_loc),
4563                                                1, &ref_tree, &ref_root_bh);
4564                 if (ret) {
4565                         mlog_errno(ret);
4566                         goto out;
4567                 }
4568
4569                 /* Mark s_inode's extent as refcounted. */
4570                 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
4571                         ret = ocfs2_add_refcount_flag(s_inode, &s_et,
4572                                                       &ref_tree->rf_ci,
4573                                                       ref_root_bh, spos,
4574                                                       p_cluster, num_clusters,
4575                                                       dealloc, NULL);
4576                         if (ret) {
4577                                 mlog_errno(ret);
4578                                 goto out_unlock_refcount;
4579                         }
4580                 }
4581
4582                 /* Map in the new extent. */
4583                 ext_flags |= OCFS2_EXT_REFCOUNTED;
4584                 ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
4585                                                   &ref_tree->rf_ci,
4586                                                   ref_root_bh,
4587                                                   tpos, p_cluster,
4588                                                   num_clusters,
4589                                                   ext_flags,
4590                                                   dealloc);
4591                 if (ret) {
4592                         mlog_errno(ret);
4593                         goto out_unlock_refcount;
4594                 }
4595
4596                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4597                 brelse(ref_root_bh);
4598 next_loop:
4599                 spos += num_clusters;
4600                 tpos += num_clusters;
4601                 remapped_clus += num_clusters;
4602         }
4603
4604         goto out;
4605 out_unlock_refcount:
4606         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4607         brelse(ref_root_bh);
4608 out:
4609         remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
4610         remapped_bytes = min_t(loff_t, len, remapped_bytes);
4611
4612         return remapped_bytes > 0 ? remapped_bytes : ret;
4613 }
4614
4615 /* Set up refcount tree and remap s_inode to t_inode. */
4616 loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
4617                                   struct buffer_head *s_bh,
4618                                   loff_t pos_in,
4619                                   struct inode *t_inode,
4620                                   struct buffer_head *t_bh,
4621                                   loff_t pos_out,
4622                                   loff_t len)
4623 {
4624         struct ocfs2_cached_dealloc_ctxt dealloc;
4625         struct ocfs2_super *osb;
4626         struct ocfs2_dinode *dis;
4627         struct ocfs2_dinode *dit;
4628         loff_t ret;
4629
4630         osb = OCFS2_SB(s_inode->i_sb);
4631         dis = (struct ocfs2_dinode *)s_bh->b_data;
4632         dit = (struct ocfs2_dinode *)t_bh->b_data;
4633         ocfs2_init_dealloc_ctxt(&dealloc);
4634
4635         /*
4636          * If we're reflinking the entire file and the source is inline
4637          * data, just copy the contents.
4638          */
4639         if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
4640             i_size_read(t_inode) <= len &&
4641             (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
4642                 ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
4643                 if (ret)
4644                         mlog_errno(ret);
4645                 goto out;
4646         }
4647
4648         /*
4649          * If both inodes belong to two different refcount groups then
4650          * forget it because we don't know how (or want) to go merging
4651          * refcount trees.
4652          */
4653         ret = -EOPNOTSUPP;
4654         if (ocfs2_is_refcount_inode(s_inode) &&
4655             ocfs2_is_refcount_inode(t_inode) &&
4656             le64_to_cpu(dis->i_refcount_loc) !=
4657             le64_to_cpu(dit->i_refcount_loc))
4658                 goto out;
4659
4660         /* Neither inode has a refcount tree.  Add one to s_inode. */
4661         if (!ocfs2_is_refcount_inode(s_inode) &&
4662             !ocfs2_is_refcount_inode(t_inode)) {
4663                 ret = ocfs2_create_refcount_tree(s_inode, s_bh);
4664                 if (ret) {
4665                         mlog_errno(ret);
4666                         goto out;
4667                 }
4668         }
4669
4670         /* Ensure that both inodes end up with the same refcount tree. */
4671         if (!ocfs2_is_refcount_inode(s_inode)) {
4672                 ret = ocfs2_set_refcount_tree(s_inode, s_bh,
4673                                               le64_to_cpu(dit->i_refcount_loc));
4674                 if (ret) {
4675                         mlog_errno(ret);
4676                         goto out;
4677                 }
4678         }
4679         if (!ocfs2_is_refcount_inode(t_inode)) {
4680                 ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4681                                               le64_to_cpu(dis->i_refcount_loc));
4682                 if (ret) {
4683                         mlog_errno(ret);
4684                         goto out;
4685                 }
4686         }
4687
4688         /* Turn off inline data in the dest file. */
4689         if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4690                 ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
4691                 if (ret) {
4692                         mlog_errno(ret);
4693                         goto out;
4694                 }
4695         }
4696
4697         /* Actually remap extents now. */
4698         ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
4699                                          pos_out, len, &dealloc);
4700         if (ret < 0) {
4701                 mlog_errno(ret);
4702                 goto out;
4703         }
4704
4705 out:
4706         if (ocfs2_dealloc_has_cluster(&dealloc)) {
4707                 ocfs2_schedule_truncate_log_flush(osb, 1);
4708                 ocfs2_run_deallocs(osb, &dealloc);
4709         }
4710
4711         return ret;
4712 }
4713
4714 /* Lock an inode and grab a bh pointing to the inode. */
4715 int ocfs2_reflink_inodes_lock(struct inode *s_inode,
4716                               struct buffer_head **bh_s,
4717                               struct inode *t_inode,
4718                               struct buffer_head **bh_t)
4719 {
4720         struct inode *inode1 = s_inode;
4721         struct inode *inode2 = t_inode;
4722         struct ocfs2_inode_info *oi1;
4723         struct ocfs2_inode_info *oi2;
4724         struct buffer_head *bh1 = NULL;
4725         struct buffer_head *bh2 = NULL;
4726         bool same_inode = (s_inode == t_inode);
4727         bool need_swap = (inode1->i_ino > inode2->i_ino);
4728         int status;
4729
4730         /* First grab the VFS and rw locks. */
4731         lock_two_nondirectories(s_inode, t_inode);
4732         if (need_swap)
4733                 swap(inode1, inode2);
4734
4735         status = ocfs2_rw_lock(inode1, 1);
4736         if (status) {
4737                 mlog_errno(status);
4738                 goto out_i1;
4739         }
4740         if (!same_inode) {
4741                 status = ocfs2_rw_lock(inode2, 1);
4742                 if (status) {
4743                         mlog_errno(status);
4744                         goto out_i2;
4745                 }
4746         }
4747
4748         /* Now go for the cluster locks */
4749         oi1 = OCFS2_I(inode1);
4750         oi2 = OCFS2_I(inode2);
4751
4752         trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
4753                                 (unsigned long long)oi2->ip_blkno);
4754
4755         /* We always want to lock the one with the lower lockid first. */
4756         if (oi1->ip_blkno > oi2->ip_blkno)
4757                 mlog_errno(-ENOLCK);
4758
4759         /* lock id1 */
4760         status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
4761                                          OI_LS_REFLINK_TARGET);
4762         if (status < 0) {
4763                 if (status != -ENOENT)
4764                         mlog_errno(status);
4765                 goto out_rw2;
4766         }
4767
4768         /* lock id2 */
4769         if (!same_inode) {
4770                 status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
4771                                                  OI_LS_REFLINK_TARGET);
4772                 if (status < 0) {
4773                         if (status != -ENOENT)
4774                                 mlog_errno(status);
4775                         goto out_cl1;
4776                 }
4777         } else {
4778                 bh2 = bh1;
4779         }
4780
4781         /*
4782          * If we swapped inode order above, we have to swap the buffer heads
4783          * before passing them back to the caller.
4784          */
4785         if (need_swap)
4786                 swap(bh1, bh2);
4787         *bh_s = bh1;
4788         *bh_t = bh2;
4789
4790         trace_ocfs2_double_lock_end(
4791                         (unsigned long long)oi1->ip_blkno,
4792                         (unsigned long long)oi2->ip_blkno);
4793
4794         return 0;
4795
4796 out_cl1:
4797         ocfs2_inode_unlock(inode1, 1);
4798         brelse(bh1);
4799 out_rw2:
4800         ocfs2_rw_unlock(inode2, 1);
4801 out_i2:
4802         ocfs2_rw_unlock(inode1, 1);
4803 out_i1:
4804         unlock_two_nondirectories(s_inode, t_inode);
4805         return status;
4806 }
4807
4808 /* Unlock both inodes and release buffers. */
4809 void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
4810                                  struct buffer_head *s_bh,
4811                                  struct inode *t_inode,
4812                                  struct buffer_head *t_bh)
4813 {
4814         ocfs2_inode_unlock(s_inode, 1);
4815         ocfs2_rw_unlock(s_inode, 1);
4816         brelse(s_bh);
4817         if (s_inode != t_inode) {
4818                 ocfs2_inode_unlock(t_inode, 1);
4819                 ocfs2_rw_unlock(t_inode, 1);
4820                 brelse(t_bh);
4821         }
4822         unlock_two_nondirectories(s_inode, t_inode);
4823 }