ocfs2: acknowledge return value of ocfs2_error()
[linux-2.6-microblaze.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "blockcheck.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 #include "ocfs2_trace.h"
47
48 #include "buffer_head_io.h"
49
50 #define NOT_ALLOC_NEW_GROUP             0
51 #define ALLOC_NEW_GROUP                 0x1
52 #define ALLOC_GROUPS_FROM_GLOBAL        0x2
53
54 #define OCFS2_MAX_TO_STEAL              1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_bg_stable_blkno; /*
61                                              * Doesn't change, always
62                                              * set to target block
63                                              * group descriptor
64                                              * block.
65                                              */
66         u64             sr_blkno;       /* The first allocated block */
67         unsigned int    sr_bit_offset;  /* The bit in the bg */
68         unsigned int    sr_bits;        /* How many bits we claimed */
69 };
70
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73         if (res->sr_blkno == 0)
74                 return 0;
75
76         if (res->sr_bg_blkno)
77                 return res->sr_bg_blkno;
78
79         return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81
82 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
83 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
84 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
85 static int ocfs2_block_group_fill(handle_t *handle,
86                                   struct inode *alloc_inode,
87                                   struct buffer_head *bg_bh,
88                                   u64 group_blkno,
89                                   unsigned int group_clusters,
90                                   u16 my_chain,
91                                   struct ocfs2_chain_list *cl);
92 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
93                                    struct inode *alloc_inode,
94                                    struct buffer_head *bh,
95                                    u64 max_block,
96                                    u64 *last_alloc_group,
97                                    int flags);
98
99 static int ocfs2_cluster_group_search(struct inode *inode,
100                                       struct buffer_head *group_bh,
101                                       u32 bits_wanted, u32 min_bits,
102                                       u64 max_block,
103                                       struct ocfs2_suballoc_result *res);
104 static int ocfs2_block_group_search(struct inode *inode,
105                                     struct buffer_head *group_bh,
106                                     u32 bits_wanted, u32 min_bits,
107                                     u64 max_block,
108                                     struct ocfs2_suballoc_result *res);
109 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
110                                      handle_t *handle,
111                                      u32 bits_wanted,
112                                      u32 min_bits,
113                                      struct ocfs2_suballoc_result *res);
114 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115                                          int nr);
116 static int ocfs2_relink_block_group(handle_t *handle,
117                                     struct inode *alloc_inode,
118                                     struct buffer_head *fe_bh,
119                                     struct buffer_head *bg_bh,
120                                     struct buffer_head *prev_bg_bh,
121                                     u16 chain);
122 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
123                                                      u32 wanted);
124 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
125                                                    u64 bg_blkno,
126                                                    u16 bg_bit_off);
127 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128                                                 u64 data_blkno,
129                                                 u64 *bg_blkno,
130                                                 u16 *bg_bit_off);
131 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
132                                              u32 bits_wanted, u64 max_block,
133                                              int flags,
134                                              struct ocfs2_alloc_context **ac);
135
136 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 {
138         struct inode *inode = ac->ac_inode;
139
140         if (inode) {
141                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
142                         ocfs2_inode_unlock(inode, 1);
143
144                 mutex_unlock(&inode->i_mutex);
145
146                 iput(inode);
147                 ac->ac_inode = NULL;
148         }
149         brelse(ac->ac_bh);
150         ac->ac_bh = NULL;
151         ac->ac_resv = NULL;
152         if (ac->ac_find_loc_priv) {
153                 kfree(ac->ac_find_loc_priv);
154                 ac->ac_find_loc_priv = NULL;
155         }
156 }
157
158 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
159 {
160         ocfs2_free_ac_resource(ac);
161         kfree(ac);
162 }
163
164 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
165 {
166         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
167 }
168
169 #define do_error(fmt, ...)                                              \
170         do{                                                             \
171                 if (resize)                                     \
172                         mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
173                 else                                                    \
174                         return ocfs2_error(sb, fmt, ##__VA_ARGS__);             \
175         } while (0)
176
177 static int ocfs2_validate_gd_self(struct super_block *sb,
178                                   struct buffer_head *bh,
179                                   int resize)
180 {
181         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
182
183         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
184                 do_error("Group descriptor #%llu has bad signature %.*s",
185                          (unsigned long long)bh->b_blocknr, 7,
186                          gd->bg_signature);
187         }
188
189         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
190                 do_error("Group descriptor #%llu has an invalid bg_blkno "
191                          "of %llu",
192                          (unsigned long long)bh->b_blocknr,
193                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
194         }
195
196         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
197                 do_error("Group descriptor #%llu has an invalid "
198                          "fs_generation of #%u",
199                          (unsigned long long)bh->b_blocknr,
200                          le32_to_cpu(gd->bg_generation));
201         }
202
203         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
204                 do_error("Group descriptor #%llu has bit count %u but "
205                          "claims that %u are free",
206                          (unsigned long long)bh->b_blocknr,
207                          le16_to_cpu(gd->bg_bits),
208                          le16_to_cpu(gd->bg_free_bits_count));
209         }
210
211         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
212                 do_error("Group descriptor #%llu has bit count %u but "
213                          "max bitmap bits of %u",
214                          (unsigned long long)bh->b_blocknr,
215                          le16_to_cpu(gd->bg_bits),
216                          8 * le16_to_cpu(gd->bg_size));
217         }
218
219         return 0;
220 }
221
222 static int ocfs2_validate_gd_parent(struct super_block *sb,
223                                     struct ocfs2_dinode *di,
224                                     struct buffer_head *bh,
225                                     int resize)
226 {
227         unsigned int max_bits;
228         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
229
230         if (di->i_blkno != gd->bg_parent_dinode) {
231                 do_error("Group descriptor #%llu has bad parent "
232                          "pointer (%llu, expected %llu)",
233                          (unsigned long long)bh->b_blocknr,
234                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
235                          (unsigned long long)le64_to_cpu(di->i_blkno));
236         }
237
238         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
239         if (le16_to_cpu(gd->bg_bits) > max_bits) {
240                 do_error("Group descriptor #%llu has bit count of %u",
241                          (unsigned long long)bh->b_blocknr,
242                          le16_to_cpu(gd->bg_bits));
243         }
244
245         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
246         if ((le16_to_cpu(gd->bg_chain) >
247              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
248             ((le16_to_cpu(gd->bg_chain) ==
249              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
250                 do_error("Group descriptor #%llu has bad chain %u",
251                          (unsigned long long)bh->b_blocknr,
252                          le16_to_cpu(gd->bg_chain));
253         }
254
255         return 0;
256 }
257
258 #undef do_error
259
260 /*
261  * This version only prints errors.  It does not fail the filesystem, and
262  * exists only for resize.
263  */
264 int ocfs2_check_group_descriptor(struct super_block *sb,
265                                  struct ocfs2_dinode *di,
266                                  struct buffer_head *bh)
267 {
268         int rc;
269         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
270
271         BUG_ON(!buffer_uptodate(bh));
272
273         /*
274          * If the ecc fails, we return the error but otherwise
275          * leave the filesystem running.  We know any error is
276          * local to this block.
277          */
278         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
279         if (rc) {
280                 mlog(ML_ERROR,
281                      "Checksum failed for group descriptor %llu\n",
282                      (unsigned long long)bh->b_blocknr);
283         } else
284                 rc = ocfs2_validate_gd_self(sb, bh, 1);
285         if (!rc)
286                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
287
288         return rc;
289 }
290
291 static int ocfs2_validate_group_descriptor(struct super_block *sb,
292                                            struct buffer_head *bh)
293 {
294         int rc;
295         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
296
297         trace_ocfs2_validate_group_descriptor(
298                                         (unsigned long long)bh->b_blocknr);
299
300         BUG_ON(!buffer_uptodate(bh));
301
302         /*
303          * If the ecc fails, we return the error but otherwise
304          * leave the filesystem running.  We know any error is
305          * local to this block.
306          */
307         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
308         if (rc)
309                 return rc;
310
311         /*
312          * Errors after here are fatal.
313          */
314
315         return ocfs2_validate_gd_self(sb, bh, 0);
316 }
317
318 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
319                                 u64 gd_blkno, struct buffer_head **bh)
320 {
321         int rc;
322         struct buffer_head *tmp = *bh;
323
324         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
325                               ocfs2_validate_group_descriptor);
326         if (rc)
327                 goto out;
328
329         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
330         if (rc) {
331                 brelse(tmp);
332                 goto out;
333         }
334
335         /* If ocfs2_read_block() got us a new bh, pass it up. */
336         if (!*bh)
337                 *bh = tmp;
338
339 out:
340         return rc;
341 }
342
343 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
344                                           struct ocfs2_group_desc *bg,
345                                           struct ocfs2_chain_list *cl,
346                                           u64 p_blkno, unsigned int clusters)
347 {
348         struct ocfs2_extent_list *el = &bg->bg_list;
349         struct ocfs2_extent_rec *rec;
350
351         BUG_ON(!ocfs2_supports_discontig_bg(osb));
352         if (!el->l_next_free_rec)
353                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
354         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
355         rec->e_blkno = cpu_to_le64(p_blkno);
356         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
357                                   le16_to_cpu(cl->cl_bpc));
358         rec->e_leaf_clusters = cpu_to_le16(clusters);
359         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
360         le16_add_cpu(&bg->bg_free_bits_count,
361                      clusters * le16_to_cpu(cl->cl_bpc));
362         le16_add_cpu(&el->l_next_free_rec, 1);
363 }
364
365 static int ocfs2_block_group_fill(handle_t *handle,
366                                   struct inode *alloc_inode,
367                                   struct buffer_head *bg_bh,
368                                   u64 group_blkno,
369                                   unsigned int group_clusters,
370                                   u16 my_chain,
371                                   struct ocfs2_chain_list *cl)
372 {
373         int status = 0;
374         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
375         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
376         struct super_block * sb = alloc_inode->i_sb;
377
378         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
379                 status = ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
380                             "b_blocknr (%llu)",
381                             (unsigned long long)group_blkno,
382                             (unsigned long long) bg_bh->b_blocknr);
383                 goto bail;
384         }
385
386         status = ocfs2_journal_access_gd(handle,
387                                          INODE_CACHE(alloc_inode),
388                                          bg_bh,
389                                          OCFS2_JOURNAL_ACCESS_CREATE);
390         if (status < 0) {
391                 mlog_errno(status);
392                 goto bail;
393         }
394
395         memset(bg, 0, sb->s_blocksize);
396         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
397         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
398         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
399                                                 osb->s_feature_incompat));
400         bg->bg_chain = cpu_to_le16(my_chain);
401         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
402         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
403         bg->bg_blkno = cpu_to_le64(group_blkno);
404         if (group_clusters == le16_to_cpu(cl->cl_cpg))
405                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
406         else
407                 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
408                                               group_clusters);
409
410         /* set the 1st bit in the bitmap to account for the descriptor block */
411         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
412         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
413
414         ocfs2_journal_dirty(handle, bg_bh);
415
416         /* There is no need to zero out or otherwise initialize the
417          * other blocks in a group - All valid FS metadata in a block
418          * group stores the superblock fs_generation value at
419          * allocation time. */
420
421 bail:
422         if (status)
423                 mlog_errno(status);
424         return status;
425 }
426
427 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
428 {
429         u16 curr, best;
430
431         best = curr = 0;
432         while (curr < le16_to_cpu(cl->cl_count)) {
433                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
434                     le32_to_cpu(cl->cl_recs[curr].c_total))
435                         best = curr;
436                 curr++;
437         }
438         return best;
439 }
440
441 static struct buffer_head *
442 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
443                                struct inode *alloc_inode,
444                                struct ocfs2_alloc_context *ac,
445                                struct ocfs2_chain_list *cl)
446 {
447         int status;
448         u32 bit_off, num_bits;
449         u64 bg_blkno;
450         struct buffer_head *bg_bh;
451         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
452
453         status = ocfs2_claim_clusters(handle, ac,
454                                       le16_to_cpu(cl->cl_cpg), &bit_off,
455                                       &num_bits);
456         if (status < 0) {
457                 if (status != -ENOSPC)
458                         mlog_errno(status);
459                 goto bail;
460         }
461
462         /* setup the group */
463         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
464         trace_ocfs2_block_group_alloc_contig(
465              (unsigned long long)bg_blkno, alloc_rec);
466
467         bg_bh = sb_getblk(osb->sb, bg_blkno);
468         if (!bg_bh) {
469                 status = -ENOMEM;
470                 mlog_errno(status);
471                 goto bail;
472         }
473         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
474
475         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
476                                         bg_blkno, num_bits, alloc_rec, cl);
477         if (status < 0) {
478                 brelse(bg_bh);
479                 mlog_errno(status);
480         }
481
482 bail:
483         return status ? ERR_PTR(status) : bg_bh;
484 }
485
486 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
487                                         handle_t *handle,
488                                         struct ocfs2_alloc_context *ac,
489                                         unsigned int min_bits,
490                                         u32 *bit_off, u32 *num_bits)
491 {
492         int status = 0;
493
494         while (min_bits) {
495                 status = ocfs2_claim_clusters(handle, ac, min_bits,
496                                               bit_off, num_bits);
497                 if (status != -ENOSPC)
498                         break;
499
500                 min_bits >>= 1;
501         }
502
503         return status;
504 }
505
506 static int ocfs2_block_group_grow_discontig(handle_t *handle,
507                                             struct inode *alloc_inode,
508                                             struct buffer_head *bg_bh,
509                                             struct ocfs2_alloc_context *ac,
510                                             struct ocfs2_chain_list *cl,
511                                             unsigned int min_bits)
512 {
513         int status;
514         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
515         struct ocfs2_group_desc *bg =
516                 (struct ocfs2_group_desc *)bg_bh->b_data;
517         unsigned int needed = le16_to_cpu(cl->cl_cpg) -
518                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
519         u32 p_cpos, clusters;
520         u64 p_blkno;
521         struct ocfs2_extent_list *el = &bg->bg_list;
522
523         status = ocfs2_journal_access_gd(handle,
524                                          INODE_CACHE(alloc_inode),
525                                          bg_bh,
526                                          OCFS2_JOURNAL_ACCESS_CREATE);
527         if (status < 0) {
528                 mlog_errno(status);
529                 goto bail;
530         }
531
532         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
533                                 le16_to_cpu(el->l_count))) {
534                 if (min_bits > needed)
535                         min_bits = needed;
536                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
537                                                       min_bits, &p_cpos,
538                                                       &clusters);
539                 if (status < 0) {
540                         if (status != -ENOSPC)
541                                 mlog_errno(status);
542                         goto bail;
543                 }
544                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
545                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
546                                               clusters);
547
548                 min_bits = clusters;
549                 needed = le16_to_cpu(cl->cl_cpg) -
550                          le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
551         }
552
553         if (needed > 0) {
554                 /*
555                  * We have used up all the extent rec but can't fill up
556                  * the cpg. So bail out.
557                  */
558                 status = -ENOSPC;
559                 goto bail;
560         }
561
562         ocfs2_journal_dirty(handle, bg_bh);
563
564 bail:
565         return status;
566 }
567
568 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
569                                    struct ocfs2_alloc_context *cluster_ac,
570                                    struct inode *alloc_inode,
571                                    struct buffer_head *bg_bh)
572 {
573         int i, ret;
574         struct ocfs2_group_desc *bg;
575         struct ocfs2_extent_list *el;
576         struct ocfs2_extent_rec *rec;
577
578         if (!bg_bh)
579                 return;
580
581         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
582         el = &bg->bg_list;
583         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
584                 rec = &el->l_recs[i];
585                 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
586                                           cluster_ac->ac_bh,
587                                           le64_to_cpu(rec->e_blkno),
588                                           le16_to_cpu(rec->e_leaf_clusters));
589                 if (ret)
590                         mlog_errno(ret);
591                 /* Try all the clusters to free */
592         }
593
594         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
595         brelse(bg_bh);
596 }
597
598 static struct buffer_head *
599 ocfs2_block_group_alloc_discontig(handle_t *handle,
600                                   struct inode *alloc_inode,
601                                   struct ocfs2_alloc_context *ac,
602                                   struct ocfs2_chain_list *cl)
603 {
604         int status;
605         u32 bit_off, num_bits;
606         u64 bg_blkno;
607         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
608         struct buffer_head *bg_bh = NULL;
609         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
610         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
611
612         if (!ocfs2_supports_discontig_bg(osb)) {
613                 status = -ENOSPC;
614                 goto bail;
615         }
616
617         status = ocfs2_extend_trans(handle,
618                                     ocfs2_calc_bg_discontig_credits(osb->sb));
619         if (status) {
620                 mlog_errno(status);
621                 goto bail;
622         }
623
624         /*
625          * We're going to be grabbing from multiple cluster groups.
626          * We don't have enough credits to relink them all, and the
627          * cluster groups will be staying in cache for the duration of
628          * this operation.
629          */
630         ac->ac_disable_chain_relink = 1;
631
632         /* Claim the first region */
633         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
634                                               &bit_off, &num_bits);
635         if (status < 0) {
636                 if (status != -ENOSPC)
637                         mlog_errno(status);
638                 goto bail;
639         }
640         min_bits = num_bits;
641
642         /* setup the group */
643         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
644         trace_ocfs2_block_group_alloc_discontig(
645                                 (unsigned long long)bg_blkno, alloc_rec);
646
647         bg_bh = sb_getblk(osb->sb, bg_blkno);
648         if (!bg_bh) {
649                 status = -ENOMEM;
650                 mlog_errno(status);
651                 goto bail;
652         }
653         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
654
655         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
656                                         bg_blkno, num_bits, alloc_rec, cl);
657         if (status < 0) {
658                 mlog_errno(status);
659                 goto bail;
660         }
661
662         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
663                                                   bg_bh, ac, cl, min_bits);
664         if (status)
665                 mlog_errno(status);
666
667 bail:
668         if (status)
669                 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
670         return status ? ERR_PTR(status) : bg_bh;
671 }
672
673 /*
674  * We expect the block group allocator to already be locked.
675  */
676 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
677                                    struct inode *alloc_inode,
678                                    struct buffer_head *bh,
679                                    u64 max_block,
680                                    u64 *last_alloc_group,
681                                    int flags)
682 {
683         int status, credits;
684         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
685         struct ocfs2_chain_list *cl;
686         struct ocfs2_alloc_context *ac = NULL;
687         handle_t *handle = NULL;
688         u16 alloc_rec;
689         struct buffer_head *bg_bh = NULL;
690         struct ocfs2_group_desc *bg;
691
692         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
693
694         cl = &fe->id2.i_chain;
695         status = ocfs2_reserve_clusters_with_limit(osb,
696                                                    le16_to_cpu(cl->cl_cpg),
697                                                    max_block, flags, &ac);
698         if (status < 0) {
699                 if (status != -ENOSPC)
700                         mlog_errno(status);
701                 goto bail;
702         }
703
704         credits = ocfs2_calc_group_alloc_credits(osb->sb,
705                                                  le16_to_cpu(cl->cl_cpg));
706         handle = ocfs2_start_trans(osb, credits);
707         if (IS_ERR(handle)) {
708                 status = PTR_ERR(handle);
709                 handle = NULL;
710                 mlog_errno(status);
711                 goto bail;
712         }
713
714         if (last_alloc_group && *last_alloc_group != 0) {
715                 trace_ocfs2_block_group_alloc(
716                                 (unsigned long long)*last_alloc_group);
717                 ac->ac_last_group = *last_alloc_group;
718         }
719
720         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
721                                                ac, cl);
722         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
723                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
724                                                           alloc_inode,
725                                                           ac, cl);
726         if (IS_ERR(bg_bh)) {
727                 status = PTR_ERR(bg_bh);
728                 bg_bh = NULL;
729                 if (status != -ENOSPC)
730                         mlog_errno(status);
731                 goto bail;
732         }
733         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
734
735         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
736                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
737         if (status < 0) {
738                 mlog_errno(status);
739                 goto bail;
740         }
741
742         alloc_rec = le16_to_cpu(bg->bg_chain);
743         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
744                      le16_to_cpu(bg->bg_free_bits_count));
745         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
746                      le16_to_cpu(bg->bg_bits));
747         cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
748         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
749                 le16_add_cpu(&cl->cl_next_free_rec, 1);
750
751         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
752                                         le16_to_cpu(bg->bg_free_bits_count));
753         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
754         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
755
756         ocfs2_journal_dirty(handle, bh);
757
758         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
759         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
760         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
761                                              le32_to_cpu(fe->i_clusters)));
762         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
763         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
764         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
765         ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
766
767         status = 0;
768
769         /* save the new last alloc group so that the caller can cache it. */
770         if (last_alloc_group)
771                 *last_alloc_group = ac->ac_last_group;
772
773 bail:
774         if (handle)
775                 ocfs2_commit_trans(osb, handle);
776
777         if (ac)
778                 ocfs2_free_alloc_context(ac);
779
780         brelse(bg_bh);
781
782         if (status)
783                 mlog_errno(status);
784         return status;
785 }
786
787 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
788                                        struct ocfs2_alloc_context *ac,
789                                        int type,
790                                        u32 slot,
791                                        u64 *last_alloc_group,
792                                        int flags)
793 {
794         int status;
795         u32 bits_wanted = ac->ac_bits_wanted;
796         struct inode *alloc_inode;
797         struct buffer_head *bh = NULL;
798         struct ocfs2_dinode *fe;
799         u32 free_bits;
800
801         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
802         if (!alloc_inode) {
803                 mlog_errno(-EINVAL);
804                 return -EINVAL;
805         }
806
807         mutex_lock(&alloc_inode->i_mutex);
808
809         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
810         if (status < 0) {
811                 mutex_unlock(&alloc_inode->i_mutex);
812                 iput(alloc_inode);
813
814                 mlog_errno(status);
815                 return status;
816         }
817
818         ac->ac_inode = alloc_inode;
819         ac->ac_alloc_slot = slot;
820
821         fe = (struct ocfs2_dinode *) bh->b_data;
822
823         /* The bh was validated by the inode read inside
824          * ocfs2_inode_lock().  Any corruption is a code bug. */
825         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
826
827         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
828                 status = ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
829                             (unsigned long long)le64_to_cpu(fe->i_blkno));
830                 goto bail;
831         }
832
833         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
834                 le32_to_cpu(fe->id1.bitmap1.i_used);
835
836         if (bits_wanted > free_bits) {
837                 /* cluster bitmap never grows */
838                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
839                         trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
840                                                                 free_bits);
841                         status = -ENOSPC;
842                         goto bail;
843                 }
844
845                 if (!(flags & ALLOC_NEW_GROUP)) {
846                         trace_ocfs2_reserve_suballoc_bits_no_new_group(
847                                                 slot, bits_wanted, free_bits);
848                         status = -ENOSPC;
849                         goto bail;
850                 }
851
852                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
853                                                  ac->ac_max_block,
854                                                  last_alloc_group, flags);
855                 if (status < 0) {
856                         if (status != -ENOSPC)
857                                 mlog_errno(status);
858                         goto bail;
859                 }
860                 atomic_inc(&osb->alloc_stats.bg_extends);
861
862                 /* You should never ask for this much metadata */
863                 BUG_ON(bits_wanted >
864                        (le32_to_cpu(fe->id1.bitmap1.i_total)
865                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
866         }
867
868         get_bh(bh);
869         ac->ac_bh = bh;
870 bail:
871         brelse(bh);
872
873         if (status)
874                 mlog_errno(status);
875         return status;
876 }
877
878 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
879 {
880         spin_lock(&osb->osb_lock);
881         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
882         spin_unlock(&osb->osb_lock);
883         atomic_set(&osb->s_num_inodes_stolen, 0);
884 }
885
886 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
887 {
888         spin_lock(&osb->osb_lock);
889         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
890         spin_unlock(&osb->osb_lock);
891         atomic_set(&osb->s_num_meta_stolen, 0);
892 }
893
/* Reset both the inode and the metadata steal state of @osb. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
899
900 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
901 {
902         spin_lock(&osb->osb_lock);
903         if (type == INODE_ALLOC_SYSTEM_INODE)
904                 osb->s_inode_steal_slot = slot;
905         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
906                 osb->s_meta_steal_slot = slot;
907         spin_unlock(&osb->osb_lock);
908 }
909
910 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
911 {
912         int slot = OCFS2_INVALID_SLOT;
913
914         spin_lock(&osb->osb_lock);
915         if (type == INODE_ALLOC_SYSTEM_INODE)
916                 slot = osb->s_inode_steal_slot;
917         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
918                 slot = osb->s_meta_steal_slot;
919         spin_unlock(&osb->osb_lock);
920
921         return slot;
922 }
923
924 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
925 {
926         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
927 }
928
929 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
930 {
931         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
932 }
933
934 static int ocfs2_steal_resource(struct ocfs2_super *osb,
935                                 struct ocfs2_alloc_context *ac,
936                                 int type)
937 {
938         int i, status = -ENOSPC;
939         int slot = __ocfs2_get_steal_slot(osb, type);
940
941         /* Start to steal resource from the first slot after ours. */
942         if (slot == OCFS2_INVALID_SLOT)
943                 slot = osb->slot_num + 1;
944
945         for (i = 0; i < osb->max_slots; i++, slot++) {
946                 if (slot == osb->max_slots)
947                         slot = 0;
948
949                 if (slot == osb->slot_num)
950                         continue;
951
952                 status = ocfs2_reserve_suballoc_bits(osb, ac,
953                                                      type,
954                                                      (u32)slot, NULL,
955                                                      NOT_ALLOC_NEW_GROUP);
956                 if (status >= 0) {
957                         __ocfs2_set_steal_slot(osb, slot, type);
958                         break;
959                 }
960
961                 ocfs2_free_ac_resource(ac);
962         }
963
964         return status;
965 }
966
967 static int ocfs2_steal_inode(struct ocfs2_super *osb,
968                              struct ocfs2_alloc_context *ac)
969 {
970         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
971 }
972
973 static int ocfs2_steal_meta(struct ocfs2_super *osb,
974                             struct ocfs2_alloc_context *ac)
975 {
976         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
977 }
978
979 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
980                                       int blocks,
981                                       struct ocfs2_alloc_context **ac)
982 {
983         int status;
984         int slot = ocfs2_get_meta_steal_slot(osb);
985
986         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
987         if (!(*ac)) {
988                 status = -ENOMEM;
989                 mlog_errno(status);
990                 goto bail;
991         }
992
993         (*ac)->ac_bits_wanted = blocks;
994         (*ac)->ac_which = OCFS2_AC_USE_META;
995         (*ac)->ac_group_search = ocfs2_block_group_search;
996
997         if (slot != OCFS2_INVALID_SLOT &&
998                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
999                 goto extent_steal;
1000
1001         atomic_set(&osb->s_num_meta_stolen, 0);
1002         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1003                                              EXTENT_ALLOC_SYSTEM_INODE,
1004                                              (u32)osb->slot_num, NULL,
1005                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1006
1007
1008         if (status >= 0) {
1009                 status = 0;
1010                 if (slot != OCFS2_INVALID_SLOT)
1011                         ocfs2_init_meta_steal_slot(osb);
1012                 goto bail;
1013         } else if (status < 0 && status != -ENOSPC) {
1014                 mlog_errno(status);
1015                 goto bail;
1016         }
1017
1018         ocfs2_free_ac_resource(*ac);
1019
1020 extent_steal:
1021         status = ocfs2_steal_meta(osb, *ac);
1022         atomic_inc(&osb->s_num_meta_stolen);
1023         if (status < 0) {
1024                 if (status != -ENOSPC)
1025                         mlog_errno(status);
1026                 goto bail;
1027         }
1028
1029         status = 0;
1030 bail:
1031         if ((status < 0) && *ac) {
1032                 ocfs2_free_alloc_context(*ac);
1033                 *ac = NULL;
1034         }
1035
1036         if (status)
1037                 mlog_errno(status);
1038         return status;
1039 }
1040
/*
 * Reserve the number of metadata blocks that ocfs2_extend_meta_needed()
 * reports for @root_el.  Return value and *@ac handling are those of
 * ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}
1049
1050 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1051                             struct ocfs2_alloc_context **ac)
1052 {
1053         int status;
1054         int slot = ocfs2_get_inode_steal_slot(osb);
1055         u64 alloc_group;
1056
1057         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1058         if (!(*ac)) {
1059                 status = -ENOMEM;
1060                 mlog_errno(status);
1061                 goto bail;
1062         }
1063
1064         (*ac)->ac_bits_wanted = 1;
1065         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1066
1067         (*ac)->ac_group_search = ocfs2_block_group_search;
1068
1069         /*
1070          * stat(2) can't handle i_ino > 32bits, so we tell the
1071          * lower levels not to allocate us a block group past that
1072          * limit.  The 'inode64' mount option avoids this behavior.
1073          */
1074         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1075                 (*ac)->ac_max_block = (u32)~0U;
1076
1077         /*
1078          * slot is set when we successfully steal inode from other nodes.
1079          * It is reset in 3 places:
1080          * 1. when we flush the truncate log
1081          * 2. when we complete local alloc recovery.
1082          * 3. when we successfully allocate from our own slot.
1083          * After it is set, we will go on stealing inodes until we find the
1084          * need to check our slots to see whether there is some space for us.
1085          */
1086         if (slot != OCFS2_INVALID_SLOT &&
1087             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1088                 goto inode_steal;
1089
1090         atomic_set(&osb->s_num_inodes_stolen, 0);
1091         alloc_group = osb->osb_inode_alloc_group;
1092         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1093                                              INODE_ALLOC_SYSTEM_INODE,
1094                                              (u32)osb->slot_num,
1095                                              &alloc_group,
1096                                              ALLOC_NEW_GROUP |
1097                                              ALLOC_GROUPS_FROM_GLOBAL);
1098         if (status >= 0) {
1099                 status = 0;
1100
1101                 spin_lock(&osb->osb_lock);
1102                 osb->osb_inode_alloc_group = alloc_group;
1103                 spin_unlock(&osb->osb_lock);
1104                 trace_ocfs2_reserve_new_inode_new_group(
1105                         (unsigned long long)alloc_group);
1106
1107                 /*
1108                  * Some inodes must be freed by us, so try to allocate
1109                  * from our own next time.
1110                  */
1111                 if (slot != OCFS2_INVALID_SLOT)
1112                         ocfs2_init_inode_steal_slot(osb);
1113                 goto bail;
1114         } else if (status < 0 && status != -ENOSPC) {
1115                 mlog_errno(status);
1116                 goto bail;
1117         }
1118
1119         ocfs2_free_ac_resource(*ac);
1120
1121 inode_steal:
1122         status = ocfs2_steal_inode(osb, *ac);
1123         atomic_inc(&osb->s_num_inodes_stolen);
1124         if (status < 0) {
1125                 if (status != -ENOSPC)
1126                         mlog_errno(status);
1127                 goto bail;
1128         }
1129
1130         status = 0;
1131 bail:
1132         if ((status < 0) && *ac) {
1133                 ocfs2_free_alloc_context(*ac);
1134                 *ac = NULL;
1135         }
1136
1137         if (status)
1138                 mlog_errno(status);
1139         return status;
1140 }
1141
1142 /* local alloc code has to do the same thing, so rather than do this
1143  * twice.. */
1144 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1145                                       struct ocfs2_alloc_context *ac)
1146 {
1147         int status;
1148
1149         ac->ac_which = OCFS2_AC_USE_MAIN;
1150         ac->ac_group_search = ocfs2_cluster_group_search;
1151
1152         status = ocfs2_reserve_suballoc_bits(osb, ac,
1153                                              GLOBAL_BITMAP_SYSTEM_INODE,
1154                                              OCFS2_INVALID_SLOT, NULL,
1155                                              ALLOC_NEW_GROUP);
1156         if (status < 0 && status != -ENOSPC) {
1157                 mlog_errno(status);
1158                 goto bail;
1159         }
1160
1161 bail:
1162         return status;
1163 }
1164
1165 /* Callers don't need to care which bitmap (local alloc or main) to
1166  * use so we figure it out for them, but unfortunately this clutters
1167  * things a bit. */
1168 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1169                                              u32 bits_wanted, u64 max_block,
1170                                              int flags,
1171                                              struct ocfs2_alloc_context **ac)
1172 {
1173         int status;
1174
1175         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1176         if (!(*ac)) {
1177                 status = -ENOMEM;
1178                 mlog_errno(status);
1179                 goto bail;
1180         }
1181
1182         (*ac)->ac_bits_wanted = bits_wanted;
1183         (*ac)->ac_max_block = max_block;
1184
1185         status = -ENOSPC;
1186         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1187             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1188                 status = ocfs2_reserve_local_alloc_bits(osb,
1189                                                         bits_wanted,
1190                                                         *ac);
1191                 if ((status < 0) && (status != -ENOSPC)) {
1192                         mlog_errno(status);
1193                         goto bail;
1194                 }
1195         }
1196
1197         if (status == -ENOSPC) {
1198                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1199                 if (status < 0) {
1200                         if (status != -ENOSPC)
1201                                 mlog_errno(status);
1202                         goto bail;
1203                 }
1204         }
1205
1206         status = 0;
1207 bail:
1208         if ((status < 0) && *ac) {
1209                 ocfs2_free_alloc_context(*ac);
1210                 *ac = NULL;
1211         }
1212
1213         if (status)
1214                 mlog_errno(status);
1215         return status;
1216 }
1217
1218 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1219                            u32 bits_wanted,
1220                            struct ocfs2_alloc_context **ac)
1221 {
1222         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1223                                                  ALLOC_NEW_GROUP, ac);
1224 }
1225
1226 /*
1227  * More or less lifted from ext3. I'll leave their description below:
1228  *
1229  * "For ext3 allocations, we must not reuse any blocks which are
1230  * allocated in the bitmap buffer's "last committed data" copy.  This
1231  * prevents deletes from freeing up the page for reuse until we have
1232  * committed the delete transaction.
1233  *
1234  * If we didn't do this, then deleting something and reallocating it as
1235  * data would allow the old block to be overwritten before the
1236  * transaction committed (because we force data to disk before commit).
1237  * This would lead to corruption if we crashed between overwriting the
1238  * data and committing the delete.
1239  *
1240  * @@@ We may want to make this allocation behaviour conditional on
1241  * data-writes at some point, and disable it for metadata allocations or
1242  * sync-data inodes."
1243  *
1244  * Note: OCFS2 already does this differently for metadata vs data
1245  * allocations, as those bitmaps are separate and undo access is never
1246  * called on a metadata group descriptor.
1247  */
1248 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1249                                          int nr)
1250 {
1251         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1252         int ret;
1253
1254         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1255                 return 0;
1256
1257         if (!buffer_jbd(bg_bh))
1258                 return 1;
1259
1260         jbd_lock_bh_state(bg_bh);
1261         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1262         if (bg)
1263                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1264         else
1265                 ret = 1;
1266         jbd_unlock_bh_state(bg_bh);
1267
1268         return ret;
1269 }
1270
1271 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1272                                              struct buffer_head *bg_bh,
1273                                              unsigned int bits_wanted,
1274                                              unsigned int total_bits,
1275                                              struct ocfs2_suballoc_result *res)
1276 {
1277         void *bitmap;
1278         u16 best_offset, best_size;
1279         int offset, start, found, status = 0;
1280         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1281
1282         /* Callers got this descriptor from
1283          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1284         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1285
1286         found = start = best_offset = best_size = 0;
1287         bitmap = bg->bg_bitmap;
1288
1289         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1290                 if (offset == total_bits)
1291                         break;
1292
1293                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1294                         /* We found a zero, but we can't use it as it
1295                          * hasn't been put to disk yet! */
1296                         found = 0;
1297                         start = offset + 1;
1298                 } else if (offset == start) {
1299                         /* we found a zero */
1300                         found++;
1301                         /* move start to the next bit to test */
1302                         start++;
1303                 } else {
1304                         /* got a zero after some ones */
1305                         found = 1;
1306                         start = offset + 1;
1307                 }
1308                 if (found > best_size) {
1309                         best_size = found;
1310                         best_offset = start - found;
1311                 }
1312                 /* we got everything we needed */
1313                 if (found == bits_wanted) {
1314                         /* mlog(0, "Found it all!\n"); */
1315                         break;
1316                 }
1317         }
1318
1319         if (best_size) {
1320                 res->sr_bit_offset = best_offset;
1321                 res->sr_bits = best_size;
1322         } else {
1323                 status = -ENOSPC;
1324                 /* No error log here -- see the comment above
1325                  * ocfs2_test_bg_bit_allocatable */
1326         }
1327
1328         return status;
1329 }
1330
1331 int ocfs2_block_group_set_bits(handle_t *handle,
1332                                              struct inode *alloc_inode,
1333                                              struct ocfs2_group_desc *bg,
1334                                              struct buffer_head *group_bh,
1335                                              unsigned int bit_off,
1336                                              unsigned int num_bits)
1337 {
1338         int status;
1339         void *bitmap = bg->bg_bitmap;
1340         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1341
1342         /* All callers get the descriptor via
1343          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1344         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1345         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1346
1347         trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1348
1349         if (ocfs2_is_cluster_bitmap(alloc_inode))
1350                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1351
1352         status = ocfs2_journal_access_gd(handle,
1353                                          INODE_CACHE(alloc_inode),
1354                                          group_bh,
1355                                          journal_type);
1356         if (status < 0) {
1357                 mlog_errno(status);
1358                 goto bail;
1359         }
1360
1361         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1362         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1363                 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1364                             " count %u but claims %u are freed. num_bits %d",
1365                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
1366                             le16_to_cpu(bg->bg_bits),
1367                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
1368         }
1369         while(num_bits--)
1370                 ocfs2_set_bit(bit_off++, bitmap);
1371
1372         ocfs2_journal_dirty(handle, group_bh);
1373
1374 bail:
1375         return status;
1376 }
1377
1378 /* find the one with the most empty bits */
1379 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1380 {
1381         u16 curr, best;
1382
1383         BUG_ON(!cl->cl_next_free_rec);
1384
1385         best = curr = 0;
1386         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1387                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1388                     le32_to_cpu(cl->cl_recs[best].c_free))
1389                         best = curr;
1390                 curr++;
1391         }
1392
1393         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1394         return best;
1395 }
1396
1397 static int ocfs2_relink_block_group(handle_t *handle,
1398                                     struct inode *alloc_inode,
1399                                     struct buffer_head *fe_bh,
1400                                     struct buffer_head *bg_bh,
1401                                     struct buffer_head *prev_bg_bh,
1402                                     u16 chain)
1403 {
1404         int status;
1405         /* there is a really tiny chance the journal calls could fail,
1406          * but we wouldn't want inconsistent blocks in *any* case. */
1407         u64 bg_ptr, prev_bg_ptr;
1408         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1409         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1410         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1411
1412         /* The caller got these descriptors from
1413          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1414         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1415         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1416
1417         trace_ocfs2_relink_block_group(
1418                 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1419                 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1420                 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1421
1422         bg_ptr = le64_to_cpu(bg->bg_next_group);
1423         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1424
1425         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1426                                          prev_bg_bh,
1427                                          OCFS2_JOURNAL_ACCESS_WRITE);
1428         if (status < 0)
1429                 goto out;
1430
1431         prev_bg->bg_next_group = bg->bg_next_group;
1432         ocfs2_journal_dirty(handle, prev_bg_bh);
1433
1434         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1435                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1436         if (status < 0)
1437                 goto out_rollback_prev_bg;
1438
1439         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1440         ocfs2_journal_dirty(handle, bg_bh);
1441
1442         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1443                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1444         if (status < 0)
1445                 goto out_rollback_bg;
1446
1447         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1448         ocfs2_journal_dirty(handle, fe_bh);
1449
1450 out:
1451         if (status < 0)
1452                 mlog_errno(status);
1453         return status;
1454
1455 out_rollback_bg:
1456         bg->bg_next_group = cpu_to_le64(bg_ptr);
1457 out_rollback_prev_bg:
1458         prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1459         goto out;
1460 }
1461
1462 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1463                                                      u32 wanted)
1464 {
1465         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1466 }
1467
1468 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1469  * value on error. */
1470 static int ocfs2_cluster_group_search(struct inode *inode,
1471                                       struct buffer_head *group_bh,
1472                                       u32 bits_wanted, u32 min_bits,
1473                                       u64 max_block,
1474                                       struct ocfs2_suballoc_result *res)
1475 {
1476         int search = -ENOSPC;
1477         int ret;
1478         u64 blkoff;
1479         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1480         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1481         unsigned int max_bits, gd_cluster_off;
1482
1483         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1484
1485         if (gd->bg_free_bits_count) {
1486                 max_bits = le16_to_cpu(gd->bg_bits);
1487
1488                 /* Tail groups in cluster bitmaps which aren't cpg
1489                  * aligned are prone to partial extension by a failed
1490                  * fs resize. If the file system resize never got to
1491                  * update the dinode cluster count, then we don't want
1492                  * to trust any clusters past it, regardless of what
1493                  * the group descriptor says. */
1494                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1495                                                           le64_to_cpu(gd->bg_blkno));
1496                 if ((gd_cluster_off + max_bits) >
1497                     OCFS2_I(inode)->ip_clusters) {
1498                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1499                         trace_ocfs2_cluster_group_search_wrong_max_bits(
1500                                 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1501                                 le16_to_cpu(gd->bg_bits),
1502                                 OCFS2_I(inode)->ip_clusters, max_bits);
1503                 }
1504
1505                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1506                                                         group_bh, bits_wanted,
1507                                                         max_bits, res);
1508                 if (ret)
1509                         return ret;
1510
1511                 if (max_block) {
1512                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1513                                                           gd_cluster_off +
1514                                                           res->sr_bit_offset +
1515                                                           res->sr_bits);
1516                         trace_ocfs2_cluster_group_search_max_block(
1517                                 (unsigned long long)blkoff,
1518                                 (unsigned long long)max_block);
1519                         if (blkoff > max_block)
1520                                 return -ENOSPC;
1521                 }
1522
1523                 /* ocfs2_block_group_find_clear_bits() might
1524                  * return success, but we still want to return
1525                  * -ENOSPC unless it found the minimum number
1526                  * of bits. */
1527                 if (min_bits <= res->sr_bits)
1528                         search = 0; /* success */
1529                 else if (res->sr_bits) {
1530                         /*
1531                          * Don't show bits which we'll be returning
1532                          * for allocation to the local alloc bitmap.
1533                          */
1534                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1535                 }
1536         }
1537
1538         return search;
1539 }
1540
1541 static int ocfs2_block_group_search(struct inode *inode,
1542                                     struct buffer_head *group_bh,
1543                                     u32 bits_wanted, u32 min_bits,
1544                                     u64 max_block,
1545                                     struct ocfs2_suballoc_result *res)
1546 {
1547         int ret = -ENOSPC;
1548         u64 blkoff;
1549         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1550
1551         BUG_ON(min_bits != 1);
1552         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1553
1554         if (bg->bg_free_bits_count) {
1555                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1556                                                         group_bh, bits_wanted,
1557                                                         le16_to_cpu(bg->bg_bits),
1558                                                         res);
1559                 if (!ret && max_block) {
1560                         blkoff = le64_to_cpu(bg->bg_blkno) +
1561                                 res->sr_bit_offset + res->sr_bits;
1562                         trace_ocfs2_block_group_search_max_block(
1563                                 (unsigned long long)blkoff,
1564                                 (unsigned long long)max_block);
1565                         if (blkoff > max_block)
1566                                 ret = -ENOSPC;
1567                 }
1568         }
1569
1570         return ret;
1571 }
1572
1573 int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1574                                        handle_t *handle,
1575                                        struct buffer_head *di_bh,
1576                                        u32 num_bits,
1577                                        u16 chain)
1578 {
1579         int ret;
1580         u32 tmp_used;
1581         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1582         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1583
1584         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1585                                       OCFS2_JOURNAL_ACCESS_WRITE);
1586         if (ret < 0) {
1587                 mlog_errno(ret);
1588                 goto out;
1589         }
1590
1591         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1592         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1593         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1594         ocfs2_journal_dirty(handle, di_bh);
1595
1596 out:
1597         return ret;
1598 }
1599
1600 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1601                                        struct buffer_head *di_bh,
1602                                        u32 num_bits,
1603                                        u16 chain)
1604 {
1605         u32 tmp_used;
1606         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1607         struct ocfs2_chain_list *cl;
1608
1609         cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1610         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1611         di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1612         le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1613 }
1614
1615 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1616                                          struct ocfs2_extent_rec *rec,
1617                                          struct ocfs2_chain_list *cl)
1618 {
1619         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1620         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1621         unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1622
1623         if (res->sr_bit_offset < bitoff)
1624                 return 0;
1625         if (res->sr_bit_offset >= (bitoff + bitcount))
1626                 return 0;
1627         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1628                 (res->sr_bit_offset - bitoff);
1629         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1630                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1631         return 1;
1632 }
1633
1634 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1635                                           struct ocfs2_group_desc *bg,
1636                                           struct ocfs2_suballoc_result *res)
1637 {
1638         int i;
1639         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1640         struct ocfs2_extent_rec *rec;
1641         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1642         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1643
1644         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1645                 res->sr_blkno = 0;
1646                 return;
1647         }
1648
1649         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1650         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1651         if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1652             !bg->bg_list.l_next_free_rec)
1653                 return;
1654
1655         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1656                 rec = &bg->bg_list.l_recs[i];
1657                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1658                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1659                         break;
1660                 }
1661         }
1662 }
1663
1664 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1665                                   handle_t *handle,
1666                                   u32 bits_wanted,
1667                                   u32 min_bits,
1668                                   struct ocfs2_suballoc_result *res,
1669                                   u16 *bits_left)
1670 {
1671         int ret;
1672         struct buffer_head *group_bh = NULL;
1673         struct ocfs2_group_desc *gd;
1674         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1675         struct inode *alloc_inode = ac->ac_inode;
1676
1677         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1678                                           res->sr_bg_blkno, &group_bh);
1679         if (ret < 0) {
1680                 mlog_errno(ret);
1681                 return ret;
1682         }
1683
1684         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1685         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1686                                   ac->ac_max_block, res);
1687         if (ret < 0) {
1688                 if (ret != -ENOSPC)
1689                         mlog_errno(ret);
1690                 goto out;
1691         }
1692
1693         if (!ret)
1694                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1695
1696         /*
1697          * sr_bg_blkno might have been changed by
1698          * ocfs2_bg_discontig_fix_result
1699          */
1700         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1701
1702         if (ac->ac_find_loc_only)
1703                 goto out_loc_only;
1704
1705         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1706                                                res->sr_bits,
1707                                                le16_to_cpu(gd->bg_chain));
1708         if (ret < 0) {
1709                 mlog_errno(ret);
1710                 goto out;
1711         }
1712
1713         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1714                                          res->sr_bit_offset, res->sr_bits);
1715         if (ret < 0) {
1716                 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1717                                                res->sr_bits,
1718                                                le16_to_cpu(gd->bg_chain));
1719                 mlog_errno(ret);
1720         }
1721
1722 out_loc_only:
1723         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1724
1725 out:
1726         brelse(group_bh);
1727
1728         return ret;
1729 }
1730
1731 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1732                               handle_t *handle,
1733                               u32 bits_wanted,
1734                               u32 min_bits,
1735                               struct ocfs2_suballoc_result *res,
1736                               u16 *bits_left)
1737 {
1738         int status;
1739         u16 chain;
1740         u64 next_group;
1741         struct inode *alloc_inode = ac->ac_inode;
1742         struct buffer_head *group_bh = NULL;
1743         struct buffer_head *prev_group_bh = NULL;
1744         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1745         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1746         struct ocfs2_group_desc *bg;
1747
1748         chain = ac->ac_chain;
1749         trace_ocfs2_search_chain_begin(
1750                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1751                 bits_wanted, chain);
1752
1753         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1754                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1755                                              &group_bh);
1756         if (status < 0) {
1757                 mlog_errno(status);
1758                 goto bail;
1759         }
1760         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1761
1762         status = -ENOSPC;
1763         /* for now, the chain search is a bit simplistic. We just use
1764          * the 1st group with any empty bits. */
1765         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1766                                              bits_wanted, min_bits,
1767                                              ac->ac_max_block,
1768                                              res)) == -ENOSPC) {
1769                 if (!bg->bg_next_group)
1770                         break;
1771
1772                 brelse(prev_group_bh);
1773                 prev_group_bh = NULL;
1774
1775                 next_group = le64_to_cpu(bg->bg_next_group);
1776                 prev_group_bh = group_bh;
1777                 group_bh = NULL;
1778                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1779                                                      next_group, &group_bh);
1780                 if (status < 0) {
1781                         mlog_errno(status);
1782                         goto bail;
1783                 }
1784                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1785         }
1786         if (status < 0) {
1787                 if (status != -ENOSPC)
1788                         mlog_errno(status);
1789                 goto bail;
1790         }
1791
1792         trace_ocfs2_search_chain_succ(
1793                 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1794
1795         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1796
1797         BUG_ON(res->sr_bits == 0);
1798         if (!status)
1799                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1800
1801         /*
1802          * sr_bg_blkno might have been changed by
1803          * ocfs2_bg_discontig_fix_result
1804          */
1805         res->sr_bg_stable_blkno = group_bh->b_blocknr;
1806
1807         /*
1808          * Keep track of previous block descriptor read. When
1809          * we find a target, if we have read more than X
1810          * number of descriptors, and the target is reasonably
1811          * empty, relink him to top of his chain.
1812          *
1813          * We've read 0 extra blocks and only send one more to
1814          * the transaction, yet the next guy to search has a
1815          * much easier time.
1816          *
1817          * Do this *after* figuring out how many bits we're taking out
1818          * of our target group.
1819          */
1820         if (!ac->ac_disable_chain_relink &&
1821             (prev_group_bh) &&
1822             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1823                 status = ocfs2_relink_block_group(handle, alloc_inode,
1824                                                   ac->ac_bh, group_bh,
1825                                                   prev_group_bh, chain);
1826                 if (status < 0) {
1827                         mlog_errno(status);
1828                         goto bail;
1829                 }
1830         }
1831
1832         if (ac->ac_find_loc_only)
1833                 goto out_loc_only;
1834
1835         status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1836                                                   ac->ac_bh, res->sr_bits,
1837                                                   chain);
1838         if (status) {
1839                 mlog_errno(status);
1840                 goto bail;
1841         }
1842
1843         status = ocfs2_block_group_set_bits(handle,
1844                                             alloc_inode,
1845                                             bg,
1846                                             group_bh,
1847                                             res->sr_bit_offset,
1848                                             res->sr_bits);
1849         if (status < 0) {
1850                 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1851                                         ac->ac_bh, res->sr_bits, chain);
1852                 mlog_errno(status);
1853                 goto bail;
1854         }
1855
1856         trace_ocfs2_search_chain_end(
1857                         (unsigned long long)le64_to_cpu(fe->i_blkno),
1858                         res->sr_bits);
1859
1860 out_loc_only:
1861         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1862 bail:
1863         brelse(group_bh);
1864         brelse(prev_group_bh);
1865
1866         if (status)
1867                 mlog_errno(status);
1868         return status;
1869 }
1870
1871 /* will give out up to bits_wanted contiguous bits. */
1872 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1873                                      handle_t *handle,
1874                                      u32 bits_wanted,
1875                                      u32 min_bits,
1876                                      struct ocfs2_suballoc_result *res)
1877 {
1878         int status;
1879         u16 victim, i;
1880         u16 bits_left = 0;
1881         u64 hint = ac->ac_last_group;
1882         struct ocfs2_chain_list *cl;
1883         struct ocfs2_dinode *fe;
1884
1885         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1886         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1887         BUG_ON(!ac->ac_bh);
1888
1889         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1890
1891         /* The bh was validated by the inode read during
1892          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1893         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1894
1895         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1896             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1897                 status = ocfs2_error(ac->ac_inode->i_sb,
1898                             "Chain allocator dinode %llu has %u used "
1899                             "bits but only %u total.",
1900                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1901                             le32_to_cpu(fe->id1.bitmap1.i_used),
1902                             le32_to_cpu(fe->id1.bitmap1.i_total));
1903                 goto bail;
1904         }
1905
1906         res->sr_bg_blkno = hint;
1907         if (res->sr_bg_blkno) {
1908                 /* Attempt to short-circuit the usual search mechanism
1909                  * by jumping straight to the most recently used
1910                  * allocation group. This helps us maintain some
1911                  * contiguousness across allocations. */
1912                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1913                                                 min_bits, res, &bits_left);
1914                 if (!status)
1915                         goto set_hint;
1916                 if (status < 0 && status != -ENOSPC) {
1917                         mlog_errno(status);
1918                         goto bail;
1919                 }
1920         }
1921
1922         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1923
1924         victim = ocfs2_find_victim_chain(cl);
1925         ac->ac_chain = victim;
1926
1927         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1928                                     res, &bits_left);
1929         if (!status) {
1930                 hint = ocfs2_group_from_res(res);
1931                 goto set_hint;
1932         }
1933         if (status < 0 && status != -ENOSPC) {
1934                 mlog_errno(status);
1935                 goto bail;
1936         }
1937
1938         trace_ocfs2_claim_suballoc_bits(victim);
1939
1940         /* If we didn't pick a good victim, then just default to
1941          * searching each chain in order. Don't allow chain relinking
1942          * because we only calculate enough journal credits for one
1943          * relink per alloc. */
1944         ac->ac_disable_chain_relink = 1;
1945         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1946                 if (i == victim)
1947                         continue;
1948                 if (!cl->cl_recs[i].c_free)
1949                         continue;
1950
1951                 ac->ac_chain = i;
1952                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1953                                             res, &bits_left);
1954                 if (!status) {
1955                         hint = ocfs2_group_from_res(res);
1956                         break;
1957                 }
1958                 if (status < 0 && status != -ENOSPC) {
1959                         mlog_errno(status);
1960                         goto bail;
1961                 }
1962         }
1963
1964 set_hint:
1965         if (status != -ENOSPC) {
1966                 /* If the next search of this group is not likely to
1967                  * yield a suitable extent, then we reset the last
1968                  * group hint so as to not waste a disk read */
1969                 if (bits_left < min_bits)
1970                         ac->ac_last_group = 0;
1971                 else
1972                         ac->ac_last_group = hint;
1973         }
1974
1975 bail:
1976         if (status)
1977                 mlog_errno(status);
1978         return status;
1979 }
1980
1981 int ocfs2_claim_metadata(handle_t *handle,
1982                          struct ocfs2_alloc_context *ac,
1983                          u32 bits_wanted,
1984                          u64 *suballoc_loc,
1985                          u16 *suballoc_bit_start,
1986                          unsigned int *num_bits,
1987                          u64 *blkno_start)
1988 {
1989         int status;
1990         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1991
1992         BUG_ON(!ac);
1993         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1994         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1995
1996         status = ocfs2_claim_suballoc_bits(ac,
1997                                            handle,
1998                                            bits_wanted,
1999                                            1,
2000                                            &res);
2001         if (status < 0) {
2002                 mlog_errno(status);
2003                 goto bail;
2004         }
2005         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2006
2007         *suballoc_loc = res.sr_bg_blkno;
2008         *suballoc_bit_start = res.sr_bit_offset;
2009         *blkno_start = res.sr_blkno;
2010         ac->ac_bits_given += res.sr_bits;
2011         *num_bits = res.sr_bits;
2012         status = 0;
2013 bail:
2014         if (status)
2015                 mlog_errno(status);
2016         return status;
2017 }
2018
2019 static void ocfs2_init_inode_ac_group(struct inode *dir,
2020                                       struct buffer_head *parent_di_bh,
2021                                       struct ocfs2_alloc_context *ac)
2022 {
2023         struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2024         /*
2025          * Try to allocate inodes from some specific group.
2026          *
2027          * If the parent dir has recorded the last group used in allocation,
2028          * cool, use it. Otherwise if we try to allocate new inode from the
2029          * same slot the parent dir belongs to, use the same chunk.
2030          *
2031          * We are very careful here to avoid the mistake of setting
2032          * ac_last_group to a group descriptor from a different (unlocked) slot.
2033          */
2034         if (OCFS2_I(dir)->ip_last_used_group &&
2035             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2036                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2037         else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2038                 if (di->i_suballoc_loc)
2039                         ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2040                 else
2041                         ac->ac_last_group = ocfs2_which_suballoc_group(
2042                                         le64_to_cpu(di->i_blkno),
2043                                         le16_to_cpu(di->i_suballoc_bit));
2044         }
2045 }
2046
2047 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2048                                              struct ocfs2_alloc_context *ac)
2049 {
2050         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2051         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2052 }
2053
2054 int ocfs2_find_new_inode_loc(struct inode *dir,
2055                              struct buffer_head *parent_fe_bh,
2056                              struct ocfs2_alloc_context *ac,
2057                              u64 *fe_blkno)
2058 {
2059         int ret;
2060         handle_t *handle = NULL;
2061         struct ocfs2_suballoc_result *res;
2062
2063         BUG_ON(!ac);
2064         BUG_ON(ac->ac_bits_given != 0);
2065         BUG_ON(ac->ac_bits_wanted != 1);
2066         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2067
2068         res = kzalloc(sizeof(*res), GFP_NOFS);
2069         if (res == NULL) {
2070                 ret = -ENOMEM;
2071                 mlog_errno(ret);
2072                 goto out;
2073         }
2074
2075         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2076
2077         /*
2078          * The handle started here is for chain relink. Alternatively,
2079          * we could just disable relink for these calls.
2080          */
2081         handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2082         if (IS_ERR(handle)) {
2083                 ret = PTR_ERR(handle);
2084                 handle = NULL;
2085                 mlog_errno(ret);
2086                 goto out;
2087         }
2088
2089         /*
2090          * This will instruct ocfs2_claim_suballoc_bits and
2091          * ocfs2_search_one_group to search but save actual allocation
2092          * for later.
2093          */
2094         ac->ac_find_loc_only = 1;
2095
2096         ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2097         if (ret < 0) {
2098                 mlog_errno(ret);
2099                 goto out;
2100         }
2101
2102         ac->ac_find_loc_priv = res;
2103         *fe_blkno = res->sr_blkno;
2104         ocfs2_update_inode_fsync_trans(handle, dir, 0);
2105 out:
2106         if (handle)
2107                 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2108
2109         if (ret)
2110                 kfree(res);
2111
2112         return ret;
2113 }
2114
2115 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2116                                  struct inode *dir,
2117                                  struct ocfs2_alloc_context *ac,
2118                                  u64 *suballoc_loc,
2119                                  u16 *suballoc_bit,
2120                                  u64 di_blkno)
2121 {
2122         int ret;
2123         u16 chain;
2124         struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2125         struct buffer_head *bg_bh = NULL;
2126         struct ocfs2_group_desc *bg;
2127         struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2128
2129         /*
2130          * Since di_blkno is being passed back in, we check for any
2131          * inconsistencies which may have happened between
2132          * calls. These are code bugs as di_blkno is not expected to
2133          * change once returned from ocfs2_find_new_inode_loc()
2134          */
2135         BUG_ON(res->sr_blkno != di_blkno);
2136
2137         ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2138                                           res->sr_bg_stable_blkno, &bg_bh);
2139         if (ret) {
2140                 mlog_errno(ret);
2141                 goto out;
2142         }
2143
2144         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2145         chain = le16_to_cpu(bg->bg_chain);
2146
2147         ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2148                                                ac->ac_bh, res->sr_bits,
2149                                                chain);
2150         if (ret) {
2151                 mlog_errno(ret);
2152                 goto out;
2153         }
2154
2155         ret = ocfs2_block_group_set_bits(handle,
2156                                          ac->ac_inode,
2157                                          bg,
2158                                          bg_bh,
2159                                          res->sr_bit_offset,
2160                                          res->sr_bits);
2161         if (ret < 0) {
2162                 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2163                                                ac->ac_bh, res->sr_bits, chain);
2164                 mlog_errno(ret);
2165                 goto out;
2166         }
2167
2168         trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2169                                            res->sr_bits);
2170
2171         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2172
2173         BUG_ON(res->sr_bits != 1);
2174
2175         *suballoc_loc = res->sr_bg_blkno;
2176         *suballoc_bit = res->sr_bit_offset;
2177         ac->ac_bits_given++;
2178         ocfs2_save_inode_ac_group(dir, ac);
2179
2180 out:
2181         brelse(bg_bh);
2182
2183         return ret;
2184 }
2185
2186 int ocfs2_claim_new_inode(handle_t *handle,
2187                           struct inode *dir,
2188                           struct buffer_head *parent_fe_bh,
2189                           struct ocfs2_alloc_context *ac,
2190                           u64 *suballoc_loc,
2191                           u16 *suballoc_bit,
2192                           u64 *fe_blkno)
2193 {
2194         int status;
2195         struct ocfs2_suballoc_result res;
2196
2197         BUG_ON(!ac);
2198         BUG_ON(ac->ac_bits_given != 0);
2199         BUG_ON(ac->ac_bits_wanted != 1);
2200         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2201
2202         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2203
2204         status = ocfs2_claim_suballoc_bits(ac,
2205                                            handle,
2206                                            1,
2207                                            1,
2208                                            &res);
2209         if (status < 0) {
2210                 mlog_errno(status);
2211                 goto bail;
2212         }
2213         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2214
2215         BUG_ON(res.sr_bits != 1);
2216
2217         *suballoc_loc = res.sr_bg_blkno;
2218         *suballoc_bit = res.sr_bit_offset;
2219         *fe_blkno = res.sr_blkno;
2220         ac->ac_bits_given++;
2221         ocfs2_save_inode_ac_group(dir, ac);
2222         status = 0;
2223 bail:
2224         if (status)
2225                 mlog_errno(status);
2226         return status;
2227 }
2228
2229 /* translate a group desc. blkno and it's bitmap offset into
2230  * disk cluster offset. */
2231 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2232                                                    u64 bg_blkno,
2233                                                    u16 bg_bit_off)
2234 {
2235         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2236         u32 cluster = 0;
2237
2238         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2239
2240         if (bg_blkno != osb->first_cluster_group_blkno)
2241                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2242         cluster += (u32) bg_bit_off;
2243         return cluster;
2244 }
2245
2246 /* given a cluster offset, calculate which block group it belongs to
2247  * and return that block offset. */
2248 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2249 {
2250         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2251         u32 group_no;
2252
2253         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2254
2255         group_no = cluster / osb->bitmap_cpg;
2256         if (!group_no)
2257                 return osb->first_cluster_group_blkno;
2258         return ocfs2_clusters_to_blocks(inode->i_sb,
2259                                         group_no * osb->bitmap_cpg);
2260 }
2261
2262 /* given the block number of a cluster start, calculate which cluster
2263  * group and descriptor bitmap offset that corresponds to. */
2264 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2265                                                 u64 data_blkno,
2266                                                 u64 *bg_blkno,
2267                                                 u16 *bg_bit_off)
2268 {
2269         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2270         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2271
2272         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2273
2274         *bg_blkno = ocfs2_which_cluster_group(inode,
2275                                               data_cluster);
2276
2277         if (*bg_blkno == osb->first_cluster_group_blkno)
2278                 *bg_bit_off = (u16) data_cluster;
2279         else
2280                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2281                                                              data_blkno - *bg_blkno);
2282 }
2283
/*
 * min_clusters - minimum contiguous chunk from this total allocation we
 * can handle. Set it to what was originally asked for to get a fully
 * contiguous allocation, or to '1' to indicate we can deal with extents
 * of any size.
 */
2290 int __ocfs2_claim_clusters(handle_t *handle,
2291                            struct ocfs2_alloc_context *ac,
2292                            u32 min_clusters,
2293                            u32 max_clusters,
2294                            u32 *cluster_start,
2295                            u32 *num_clusters)
2296 {
2297         int status;
2298         unsigned int bits_wanted = max_clusters;
2299         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2300         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2301
2302         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2303
2304         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2305                && ac->ac_which != OCFS2_AC_USE_MAIN);
2306
2307         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2308                 WARN_ON(min_clusters > 1);
2309
2310                 status = ocfs2_claim_local_alloc_bits(osb,
2311                                                       handle,
2312                                                       ac,
2313                                                       bits_wanted,
2314                                                       cluster_start,
2315                                                       num_clusters);
2316                 if (!status)
2317                         atomic_inc(&osb->alloc_stats.local_data);
2318         } else {
2319                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2320                         /* The only paths asking for contiguousness
2321                          * should know about this already. */
2322                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2323                              "group bitmap size %u!\n", min_clusters,
2324                              osb->bitmap_cpg);
2325                         status = -ENOSPC;
2326                         goto bail;
2327                 }
2328                 /* clamp the current request down to a realistic size. */
2329                 if (bits_wanted > (osb->bitmap_cpg - 1))
2330                         bits_wanted = osb->bitmap_cpg - 1;
2331
2332                 status = ocfs2_claim_suballoc_bits(ac,
2333                                                    handle,
2334                                                    bits_wanted,
2335                                                    min_clusters,
2336                                                    &res);
2337                 if (!status) {
2338                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2339                         *cluster_start =
2340                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2341                                                                  res.sr_bg_blkno,
2342                                                                  res.sr_bit_offset);
2343                         atomic_inc(&osb->alloc_stats.bitmap_data);
2344                         *num_clusters = res.sr_bits;
2345                 }
2346         }
2347         if (status < 0) {
2348                 if (status != -ENOSPC)
2349                         mlog_errno(status);
2350                 goto bail;
2351         }
2352
2353         ac->ac_bits_given += *num_clusters;
2354
2355 bail:
2356         if (status)
2357                 mlog_errno(status);
2358         return status;
2359 }
2360
2361 int ocfs2_claim_clusters(handle_t *handle,
2362                          struct ocfs2_alloc_context *ac,
2363                          u32 min_clusters,
2364                          u32 *cluster_start,
2365                          u32 *num_clusters)
2366 {
2367         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2368
2369         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2370                                       bits_wanted, cluster_start, num_clusters);
2371 }
2372
2373 static int ocfs2_block_group_clear_bits(handle_t *handle,
2374                                         struct inode *alloc_inode,
2375                                         struct ocfs2_group_desc *bg,
2376                                         struct buffer_head *group_bh,
2377                                         unsigned int bit_off,
2378                                         unsigned int num_bits,
2379                                         void (*undo_fn)(unsigned int bit,
2380                                                         unsigned long *bmap))
2381 {
2382         int status;
2383         unsigned int tmp;
2384         struct ocfs2_group_desc *undo_bg = NULL;
2385
2386         /* The caller got this descriptor from
2387          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2388         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2389
2390         trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2391
2392         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2393         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2394                                          group_bh,
2395                                          undo_fn ?
2396                                          OCFS2_JOURNAL_ACCESS_UNDO :
2397                                          OCFS2_JOURNAL_ACCESS_WRITE);
2398         if (status < 0) {
2399                 mlog_errno(status);
2400                 goto bail;
2401         }
2402
2403         if (undo_fn) {
2404                 jbd_lock_bh_state(group_bh);
2405                 undo_bg = (struct ocfs2_group_desc *)
2406                                         bh2jh(group_bh)->b_committed_data;
2407                 BUG_ON(!undo_bg);
2408         }
2409
2410         tmp = num_bits;
2411         while(tmp--) {
2412                 ocfs2_clear_bit((bit_off + tmp),
2413                                 (unsigned long *) bg->bg_bitmap);
2414                 if (undo_fn)
2415                         undo_fn(bit_off + tmp,
2416                                 (unsigned long *) undo_bg->bg_bitmap);
2417         }
2418         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2419         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2420                 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2421                             " count %u but claims %u are freed. num_bits %d",
2422                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
2423                             le16_to_cpu(bg->bg_bits),
2424                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
2425         }
2426
2427         if (undo_fn)
2428                 jbd_unlock_bh_state(group_bh);
2429
2430         ocfs2_journal_dirty(handle, group_bh);
2431 bail:
2432         return status;
2433 }
2434
2435 /*
2436  * expects the suballoc inode to already be locked.
2437  */
2438 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2439                                      struct inode *alloc_inode,
2440                                      struct buffer_head *alloc_bh,
2441                                      unsigned int start_bit,
2442                                      u64 bg_blkno,
2443                                      unsigned int count,
2444                                      void (*undo_fn)(unsigned int bit,
2445                                                      unsigned long *bitmap))
2446 {
2447         int status = 0;
2448         u32 tmp_used;
2449         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2450         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2451         struct buffer_head *group_bh = NULL;
2452         struct ocfs2_group_desc *group;
2453
2454         /* The alloc_bh comes from ocfs2_free_dinode() or
2455          * ocfs2_free_clusters().  The callers have all locked the
2456          * allocator and gotten alloc_bh from the lock call.  This
2457          * validates the dinode buffer.  Any corruption that has happened
2458          * is a code bug. */
2459         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2460         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2461
2462         trace_ocfs2_free_suballoc_bits(
2463                 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2464                 (unsigned long long)bg_blkno,
2465                 start_bit, count);
2466
2467         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2468                                              &group_bh);
2469         if (status < 0) {
2470                 mlog_errno(status);
2471                 goto bail;
2472         }
2473         group = (struct ocfs2_group_desc *) group_bh->b_data;
2474
2475         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2476
2477         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2478                                               group, group_bh,
2479                                               start_bit, count, undo_fn);
2480         if (status < 0) {
2481                 mlog_errno(status);
2482                 goto bail;
2483         }
2484
2485         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2486                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2487         if (status < 0) {
2488                 mlog_errno(status);
2489                 ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2490                                 start_bit, count);
2491                 goto bail;
2492         }
2493
2494         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2495                      count);
2496         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2497         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2498         ocfs2_journal_dirty(handle, alloc_bh);
2499
2500 bail:
2501         brelse(group_bh);
2502
2503         if (status)
2504                 mlog_errno(status);
2505         return status;
2506 }
2507
2508 int ocfs2_free_suballoc_bits(handle_t *handle,
2509                              struct inode *alloc_inode,
2510                              struct buffer_head *alloc_bh,
2511                              unsigned int start_bit,
2512                              u64 bg_blkno,
2513                              unsigned int count)
2514 {
2515         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2516                                          start_bit, bg_blkno, count, NULL);
2517 }
2518
2519 int ocfs2_free_dinode(handle_t *handle,
2520                       struct inode *inode_alloc_inode,
2521                       struct buffer_head *inode_alloc_bh,
2522                       struct ocfs2_dinode *di)
2523 {
2524         u64 blk = le64_to_cpu(di->i_blkno);
2525         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2526         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2527
2528         if (di->i_suballoc_loc)
2529                 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2530         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2531                                         inode_alloc_bh, bit, bg_blkno, 1);
2532 }
2533
2534 static int _ocfs2_free_clusters(handle_t *handle,
2535                                 struct inode *bitmap_inode,
2536                                 struct buffer_head *bitmap_bh,
2537                                 u64 start_blk,
2538                                 unsigned int num_clusters,
2539                                 void (*undo_fn)(unsigned int bit,
2540                                                 unsigned long *bitmap))
2541 {
2542         int status;
2543         u16 bg_start_bit;
2544         u64 bg_blkno;
2545         struct ocfs2_dinode *fe;
2546
2547         /* You can't ever have a contiguous set of clusters
2548          * bigger than a block group bitmap so we never have to worry
2549          * about looping on them.
2550          * This is expensive. We can safely remove once this stuff has
2551          * gotten tested really well. */
2552         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2553
2554         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2555
2556         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2557                                      &bg_start_bit);
2558
2559         trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2560                         (unsigned long long)start_blk,
2561                         bg_start_bit, num_clusters);
2562
2563         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2564                                            bg_start_bit, bg_blkno,
2565                                            num_clusters, undo_fn);
2566         if (status < 0) {
2567                 mlog_errno(status);
2568                 goto out;
2569         }
2570
2571         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2572                                          num_clusters);
2573
2574 out:
2575         if (status)
2576                 mlog_errno(status);
2577         return status;
2578 }
2579
2580 int ocfs2_free_clusters(handle_t *handle,
2581                         struct inode *bitmap_inode,
2582                         struct buffer_head *bitmap_bh,
2583                         u64 start_blk,
2584                         unsigned int num_clusters)
2585 {
2586         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2587                                     start_blk, num_clusters,
2588                                     _ocfs2_set_bit);
2589 }
2590
2591 /*
2592  * Give never-used clusters back to the global bitmap.  We don't need
2593  * to protect these bits in the undo buffer.
2594  */
2595 int ocfs2_release_clusters(handle_t *handle,
2596                            struct inode *bitmap_inode,
2597                            struct buffer_head *bitmap_bh,
2598                            u64 start_blk,
2599                            unsigned int num_clusters)
2600 {
2601         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2602                                     start_blk, num_clusters,
2603                                     _ocfs2_clear_bit);
2604 }
2605
2606 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2607 {
2608         printk("Block Group:\n");
2609         printk("bg_signature:       %s\n", bg->bg_signature);
2610         printk("bg_size:            %u\n", bg->bg_size);
2611         printk("bg_bits:            %u\n", bg->bg_bits);
2612         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2613         printk("bg_chain:           %u\n", bg->bg_chain);
2614         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2615         printk("bg_next_group:      %llu\n",
2616                (unsigned long long)bg->bg_next_group);
2617         printk("bg_parent_dinode:   %llu\n",
2618                (unsigned long long)bg->bg_parent_dinode);
2619         printk("bg_blkno:           %llu\n",
2620                (unsigned long long)bg->bg_blkno);
2621 }
2622
2623 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2624 {
2625         int i;
2626
2627         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2628         printk("i_signature:                  %s\n", fe->i_signature);
2629         printk("i_size:                       %llu\n",
2630                (unsigned long long)fe->i_size);
2631         printk("i_clusters:                   %u\n", fe->i_clusters);
2632         printk("i_generation:                 %u\n",
2633                le32_to_cpu(fe->i_generation));
2634         printk("id1.bitmap1.i_used:           %u\n",
2635                le32_to_cpu(fe->id1.bitmap1.i_used));
2636         printk("id1.bitmap1.i_total:          %u\n",
2637                le32_to_cpu(fe->id1.bitmap1.i_total));
2638         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2639         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2640         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2641         printk("id2.i_chain.cl_next_free_rec: %u\n",
2642                fe->id2.i_chain.cl_next_free_rec);
2643         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2644                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2645                        fe->id2.i_chain.cl_recs[i].c_free);
2646                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2647                        fe->id2.i_chain.cl_recs[i].c_total);
2648                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2649                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2650         }
2651 }
2652
2653 /*
2654  * For a given allocation, determine which allocators will need to be
2655  * accessed, and lock them, reserving the appropriate number of bits.
2656  *
2657  * Sparse file systems call this from ocfs2_write_begin_nolock()
2658  * and ocfs2_allocate_unwritten_extents().
2659  *
2660  * File systems which don't support holes call this from
2661  * ocfs2_extend_allocation().
2662  */
2663 int ocfs2_lock_allocators(struct inode *inode,
2664                           struct ocfs2_extent_tree *et,
2665                           u32 clusters_to_add, u32 extents_to_split,
2666                           struct ocfs2_alloc_context **data_ac,
2667                           struct ocfs2_alloc_context **meta_ac)
2668 {
2669         int ret = 0, num_free_extents;
2670         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2671         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2672
2673         *meta_ac = NULL;
2674         if (data_ac)
2675                 *data_ac = NULL;
2676
2677         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2678
2679         num_free_extents = ocfs2_num_free_extents(osb, et);
2680         if (num_free_extents < 0) {
2681                 ret = num_free_extents;
2682                 mlog_errno(ret);
2683                 goto out;
2684         }
2685
2686         /*
2687          * Sparse allocation file systems need to be more conservative
2688          * with reserving room for expansion - the actual allocation
2689          * happens while we've got a journal handle open so re-taking
2690          * a cluster lock (because we ran out of room for another
2691          * extent) will violate ordering rules.
2692          *
2693          * Most of the time we'll only be seeing this 1 cluster at a time
2694          * anyway.
2695          *
2696          * Always lock for any unwritten extents - we might want to
2697          * add blocks during a split.
2698          */
2699         if (!num_free_extents ||
2700             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2701                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2702                 if (ret < 0) {
2703                         if (ret != -ENOSPC)
2704                                 mlog_errno(ret);
2705                         goto out;
2706                 }
2707         }
2708
2709         if (clusters_to_add == 0)
2710                 goto out;
2711
2712         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2713         if (ret < 0) {
2714                 if (ret != -ENOSPC)
2715                         mlog_errno(ret);
2716                 goto out;
2717         }
2718
2719 out:
2720         if (ret) {
2721                 if (*meta_ac) {
2722                         ocfs2_free_alloc_context(*meta_ac);
2723                         *meta_ac = NULL;
2724                 }
2725
2726                 /*
2727                  * We cannot have an error and a non null *data_ac.
2728                  */
2729         }
2730
2731         return ret;
2732 }
2733
2734 /*
2735  * Read the inode specified by blkno to get suballoc_slot and
2736  * suballoc_bit.
2737  */
2738 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2739                                        u16 *suballoc_slot, u64 *group_blkno,
2740                                        u16 *suballoc_bit)
2741 {
2742         int status;
2743         struct buffer_head *inode_bh = NULL;
2744         struct ocfs2_dinode *inode_fe;
2745
2746         trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2747
2748         /* dirty read disk */
2749         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2750         if (status < 0) {
2751                 mlog(ML_ERROR, "read block %llu failed %d\n",
2752                      (unsigned long long)blkno, status);
2753                 goto bail;
2754         }
2755
2756         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2757         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2758                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2759                      (unsigned long long)blkno);
2760                 status = -EINVAL;
2761                 goto bail;
2762         }
2763
2764         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2765             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2766                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2767                      (unsigned long long)blkno,
2768                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2769                 status = -EINVAL;
2770                 goto bail;
2771         }
2772
2773         if (suballoc_slot)
2774                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2775         if (suballoc_bit)
2776                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2777         if (group_blkno)
2778                 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2779
2780 bail:
2781         brelse(inode_bh);
2782
2783         if (status)
2784                 mlog_errno(status);
2785         return status;
2786 }
2787
2788 /*
2789  * test whether bit is SET in allocator bitmap or not.  on success, 0
2790  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2791  * is returned and *res is meaningless.  Call this after you have
2792  * cluster locked against suballoc, or you may get a result based on
2793  * non-up2date contents
2794  */
2795 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2796                                    struct inode *suballoc,
2797                                    struct buffer_head *alloc_bh,
2798                                    u64 group_blkno, u64 blkno,
2799                                    u16 bit, int *res)
2800 {
2801         struct ocfs2_dinode *alloc_di;
2802         struct ocfs2_group_desc *group;
2803         struct buffer_head *group_bh = NULL;
2804         u64 bg_blkno;
2805         int status;
2806
2807         trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2808                                       (unsigned int)bit);
2809
2810         alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2811         if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2812                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2813                      (unsigned int)bit,
2814                      ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2815                 status = -EINVAL;
2816                 goto bail;
2817         }
2818
2819         bg_blkno = group_blkno ? group_blkno :
2820                    ocfs2_which_suballoc_group(blkno, bit);
2821         status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2822                                              &group_bh);
2823         if (status < 0) {
2824                 mlog(ML_ERROR, "read group %llu failed %d\n",
2825                      (unsigned long long)bg_blkno, status);
2826                 goto bail;
2827         }
2828
2829         group = (struct ocfs2_group_desc *) group_bh->b_data;
2830         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2831
2832 bail:
2833         brelse(group_bh);
2834
2835         if (status)
2836                 mlog_errno(status);
2837         return status;
2838 }
2839
2840 /*
2841  * Test if the bit representing this inode (blkno) is set in the
2842  * suballocator.
2843  *
2844  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2845  *
2846  * In the event of failure, a negative value is returned and *res is
2847  * meaningless.
2848  *
2849  * Callers must make sure to hold nfs_sync_lock to prevent
2850  * ocfs2_delete_inode() on another node from accessing the same
2851  * suballocator concurrently.
2852  */
2853 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2854 {
2855         int status;
2856         u64 group_blkno = 0;
2857         u16 suballoc_bit = 0, suballoc_slot = 0;
2858         struct inode *inode_alloc_inode;
2859         struct buffer_head *alloc_bh = NULL;
2860
2861         trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2862
2863         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2864                                              &group_blkno, &suballoc_bit);
2865         if (status < 0) {
2866                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2867                 goto bail;
2868         }
2869
2870         inode_alloc_inode =
2871                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2872                                             suballoc_slot);
2873         if (!inode_alloc_inode) {
2874                 /* the error code could be inaccurate, but we are not able to
2875                  * get the correct one. */
2876                 status = -EINVAL;
2877                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2878                      (u32)suballoc_slot);
2879                 goto bail;
2880         }
2881
2882         mutex_lock(&inode_alloc_inode->i_mutex);
2883         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2884         if (status < 0) {
2885                 mutex_unlock(&inode_alloc_inode->i_mutex);
2886                 iput(inode_alloc_inode);
2887                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2888                      (u32)suballoc_slot, status);
2889                 goto bail;
2890         }
2891
2892         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2893                                          group_blkno, blkno, suballoc_bit, res);
2894         if (status < 0)
2895                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2896
2897         ocfs2_inode_unlock(inode_alloc_inode, 0);
2898         mutex_unlock(&inode_alloc_inode->i_mutex);
2899
2900         iput(inode_alloc_inode);
2901         brelse(alloc_bh);
2902 bail:
2903         if (status)
2904                 mlog_errno(status);
2905         return status;
2906 }