1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
33 #include "trace_gfs2.h"
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
/*
 * Fields of struct gfs2_rbm — a position within a resource group's
 * bitmaps (rgrp pointer, bitmap-relative offset, bitmap index).
 * NOTE(review): this chunk is a truncated extraction; the original
 * line numbers are embedded in the text and the "struct gfs2_rbm {"
 * opener (and closing brace) are missing lines.
 */
40 struct gfs2_rgrpd *rgd;
41 u32 offset; /* The offset is bitmap relative */
42 int bii; /* Bitmap index */
/* rbm_bi - return the gfs2_bitmap descriptor that @rbm currently points into
 * (the bii'th element of the rgrp's rd_bits array). Braces missing in this
 * extraction. */
45 static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
47 return rbm->rgd->rd_bits + rbm->bii;
/* gfs2_rbm_to_block - convert an rbm position to a filesystem-relative block
 * number: rgrp data start + bitmap start (in blocks) + offset. The tail of
 * the return expression (presumably "+ rbm->offset;") is a missing line. */
50 static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
52 BUG_ON(rbm->offset >= rbm->rgd->rd_data);
53 return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
58 * These routines are used by the resource group routines (rgrp.c)
59 * to keep track of block allocation. Each block is represented by two
60 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
63 * 1 = Used (not metadata)
64 * 2 = Unlinked (still in use) inode
/*
 * Legal two-bit block state transitions, indexed [new_state * 4 + old_state];
 * consulted by gfs2_setbit() below. NOTE(review): the 16 initializer entries
 * (original lines 74-79) are missing from this extraction, as are the doc
 * lines for states 0 (free) and 3 (metadata).
 */
73 static const char valid_change[16] = {
/* Forward declaration of the bitmap search engine used by the allocator. */
81 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
82 struct gfs2_blkreserv *rs, bool nowrap);
86 * gfs2_setbit - Set a bit in the bitmaps
87 * @rbm: The position of the bit to set
88 * @do_clone: Also set the clone bitmap, if it exists
89 * @new_state: the new state of the block
93 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
94 unsigned char new_state)
96 unsigned char *byte1, *byte2, *end, cur_state;
97 struct gfs2_bitmap *bi = rbm_bi(rbm);
98 unsigned int buflen = bi->bi_bytes;
/* Each block occupies GFS2_BIT_SIZE bits; compute the bit position within
 * the byte that holds this block's two-bit state. */
99 const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
101 byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
102 end = bi->bi_bh->b_data + bi->bi_offset + buflen;
104 BUG_ON(byte1 >= end);
106 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
/* Reject state transitions the valid_change[] table marks illegal and
 * report filesystem inconsistency with full positional detail. */
108 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
109 struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
111 fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
112 rbm->offset, cur_state, new_state);
113 fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
114 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
115 (unsigned long long)bi->bi_bh->b_blocknr);
116 fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
117 bi->bi_offset, bi->bi_bytes,
118 (unsigned long long)gfs2_rbm_to_block(rbm));
120 gfs2_consist_rgrpd(rbm->rgd);
/* XOR of (old ^ new) shifted into place flips only the two bits that
 * differ, leaving the neighbouring block states in the byte untouched. */
123 *byte1 ^= (cur_state ^ new_state) << bit;
/* Mirror the change in the clone bitmap, when one exists and requested. */
125 if (do_clone && bi->bi_clone) {
126 byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
127 cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
128 *byte2 ^= (cur_state ^ new_state) << bit;
133 * gfs2_testbit - test a bit in the bitmaps
134 * @rbm: The bit to test
135 * @use_clone: If true, test the clone bitmap, not the official bitmap.
137 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
138 * not the "real" bitmaps, to avoid allocating recently freed blocks.
140 * Returns: The two bit block state of the requested bit
143 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
145 struct gfs2_bitmap *bi = rbm_bi(rbm);
/* Prefer the clone bitmap when asked for and present; otherwise read the
 * live buffer-head data. (An "else" line is missing in this extraction.) */
150 if (use_clone && bi->bi_clone)
151 buffer = bi->bi_clone;
153 buffer = bi->bi_bh->b_data;
154 buffer += bi->bi_offset;
155 byte = buffer + (rbm->offset / GFS2_NBBY);
156 bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
158 return (*byte >> bit) & GFS2_BIT_MASK;
163 * @ptr: Pointer to bitmap data
164 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
165 * @state: The state we are searching for
167 * We xor the bitmap data with a patter which is the bitwise opposite
168 * of what we are looking for, this gives rise to a pattern of ones
169 * wherever there is a match. Since we have two bits per entry, we
170 * take this pattern, shift it down by one place and then and it with
171 * the original. All the even bit positions (0,2,4, etc) then represent
172 * successful matches, so we mask with 0x55555..... to remove the unwanted
175 * This allows searching of a whole u64 at once (32 blocks) with a
176 * single test (on 64 bit arches).
179 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
/* search[state] is the bitwise complement of the two-bit pattern for each
 * state replicated across 64 bits, so XOR yields all-ones on a match. */
182 static const u64 search[] = {
183 [0] = 0xffffffffffffffffULL,
184 [1] = 0xaaaaaaaaaaaaaaaaULL,
185 [2] = 0x5555555555555555ULL,
186 [3] = 0x0000000000000000ULL,
/* NOTE(review): the shift/and/mask steps and return (original lines
 * 189-192) are missing lines in this extraction. */
188 tmp = le64_to_cpu(*ptr) ^ search[state];
195 * rs_cmp - multi-block reservation range compare
196 * @start: start of the new reservation
197 * @len: number of blocks in the new reservation
198 * @rs: existing reservation to compare against
200 * returns: 1 if the block range is beyond the reach of the reservation
201 * -1 if the block range is before the start of the reservation
202 * 0 if the block range overlaps with the reservation
204 static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs)
/* Rb-tree comparator used when walking rd_rstree; the "return 1;",
 * "return -1;" and "return 0;" lines are missing in this extraction. */
206 if (start >= rs->rs_start + rs->rs_requested)
208 if (rs->rs_start >= start + len)
214 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
215 * a block in a given allocation state.
216 * @buf: the buffer that holds the bitmaps
217 * @len: the length (in bytes) of the buffer
218 * @goal: start search at this block's bit-pair (within @buffer)
219 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
221 * Scope of @goal and returned block number is only within this bitmap buffer,
222 * not entire rgrp or filesystem. @buffer will be offset from the actual
223 * beginning of a bitmap block buffer, skipping any header structures, but
224 * headers are always a multiple of 64 bits long so that the buffer is
225 * always aligned to a 64 bit boundary.
227 * The size of the buffer is in bytes, but is it assumed that it is
228 * always ok to read a complete multiple of 64 bits at the end
229 * of the block in case the end is no aligned to a natural boundary.
231 * Return: the block number (bitmap buffer scope) that was found
234 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
/* spoint: starting bit within the first u64 (2 bits per block, hence the
 * << 1); ptr: the u64 containing @goal (32 blocks per u64, hence >> 5). */
237 u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
238 const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
239 const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
241 u64 mask = 0x5555555555555555ULL;
244 /* Mask off bits we don't care about at the start of the search */
246 tmp = gfs2_bit_search(ptr, mask, state);
/* Scan whole 64-bit words (32 blocks at a time) until a match or end. */
248 while(tmp == 0 && ptr < end) {
249 tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
252 /* Mask off any bits which are more than len bytes from the start */
253 if (ptr == end && (len & (sizeof(u64) - 1)))
254 tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
255 /* Didn't find anything, so return */
/* NOTE(review): the BFITNOENT return and the __ffs64() bit extraction
 * (original lines ~256-259) are missing lines in this extraction. */
260 bit /= 2; /* two bits per entry in the bitmap */
261 return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
265 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
266 * @rbm: The rbm with rgd already set correctly
267 * @block: The block number (filesystem relative)
269 * This sets the bi and offset members of an rbm based on a
270 * resource group and a filesystem relative block number. The
271 * resource group must be set in the rbm on entry, the bi and
272 * offset members will be set by this function.
274 * Returns: 0 on success, or an error code
277 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
/* Out-of-range blocks are rejected; the error-return line is missing here. */
279 if (!rgrp_contains_block(rbm->rgd, block))
282 rbm->offset = block - rbm->rgd->rd_data0;
283 /* Check if the block is within the first block */
284 if (rbm->offset < rbm_bi(rbm)->bi_blocks)
287 /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
288 rbm->offset += (sizeof(struct gfs2_rgrp) -
289 sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
/* Split the adjusted offset into bitmap index + offset within that bitmap. */
290 rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
291 rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
296 * gfs2_rbm_add - add a number of blocks to an rbm
297 * @rbm: The rbm with rgd already set correctly
298 * @blocks: The number of blocks to add to rpm
300 * This function takes an existing rbm structure and adds a number of blocks to
303 * Returns: True if the new rbm would point past the end of the rgrp.
306 static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks)
308 struct gfs2_rgrpd *rgd = rbm->rgd;
309 struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii;
/* Fast path: the target stays inside the current bitmap slice. */
311 if (rbm->offset + blocks < bi->bi_blocks) {
312 rbm->offset += blocks;
315 blocks -= bi->bi_blocks - rbm->offset;
/* Walk forward through subsequent bitmaps; loop header lines are missing
 * in this extraction. Ran past the last bitmap => past end of rgrp. */
319 if (bi == rgd->rd_bits + rgd->rd_length)
321 if (blocks < bi->bi_blocks) {
322 rbm->offset = blocks;
323 rbm->bii = bi - rgd->rd_bits;
326 blocks -= bi->bi_blocks;
331 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
332 * @rbm: Position to search (value/result)
333 * @n_unaligned: Number of unaligned blocks to check
334 * @len: Decremented for each block found (terminate on zero)
336 * Returns: true if a non-free block is encountered or the end of the resource
340 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
/* Test block-by-block against the clone bitmap (see gfs2_testbit comment)
 * so recently-freed blocks do not count as free. */
345 for (n = 0; n < n_unaligned; n++) {
346 res = gfs2_testbit(rbm, true);
347 if (res != GFS2_BLKST_FREE)
/* Advancing past the rgrp end also terminates the extent. */
352 if (gfs2_rbm_add(rbm, 1))
360 * gfs2_free_extlen - Return extent length of free blocks
361 * @rrbm: Starting position
362 * @len: Max length to check
364 * Starting at the block specified by the rbm, see how many free blocks
365 * there are, not reading more than len blocks ahead. This can be done
366 * using memchr_inv when the blocks are byte aligned, but has to be done
367 * on a block by block basis in case of unaligned blocks. Also this
368 * function can cope with bitmap boundaries (although it must stop on
369 * a resource group boundary)
371 * Returns: Number of free blocks in the extent
374 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
376 struct gfs2_rbm rbm = *rrbm;
/* Work on a local copy; offset & 3 = blocks before the next byte boundary. */
377 u32 n_unaligned = rbm.offset & 3;
381 u8 *ptr, *start, *end;
383 struct gfs2_bitmap *bi;
/* Handle the leading unaligned blocks one at a time. */
386 gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
389 n_unaligned = len & 3;
390 /* Start is now byte aligned */
/* Choose clone bitmap when present (the if/else lines are missing here),
 * then scan whole bytes: a zero byte is 4 consecutive free blocks, so
 * memchr_inv finds the first byte containing any non-free block. */
393 start = bi->bi_bh->b_data;
395 start = bi->bi_clone;
396 start += bi->bi_offset;
397 end = start + bi->bi_bytes;
398 BUG_ON(rbm.offset & 3);
399 start += (rbm.offset / GFS2_NBBY);
400 bytes = min_t(u32, len / GFS2_NBBY, (end - start));
401 ptr = memchr_inv(start, 0, bytes);
402 chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
403 chunk_size *= GFS2_NBBY;
404 BUG_ON(len < chunk_size);
/* Advance the rbm by the free chunk just counted; stop at rgrp end. */
406 block = gfs2_rbm_to_block(&rbm);
407 if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
415 n_unaligned = len & 3;
418 /* Deal with any bits left over at the end */
420 gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
426 * gfs2_bitcount - count the number of bits in a certain state
427 * @rgd: the resource group descriptor
428 * @buffer: the buffer that holds the bitmaps
429 * @buflen: the length (in bytes) of the buffer
430 * @state: the state of the block we're looking for
432 * Returns: The number of bits
435 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
436 unsigned int buflen, u8 state)
438 const u8 *byte = buffer;
439 const u8 *end = buffer + buflen;
/* Pre-shift the target state into each of the four two-bit positions of a
 * byte so each byte is checked with four masked compares. */
440 const u8 state1 = state << 2;
441 const u8 state2 = state << 4;
442 const u8 state3 = state << 6;
/* The "count++;" lines after each compare are missing in this extraction. */
445 for (; byte < end; byte++) {
446 if (((*byte) & 0x03) == state)
448 if (((*byte) & 0x0C) == state1)
450 if (((*byte) & 0x30) == state2)
452 if (((*byte) & 0xC0) == state3)
460 * gfs2_rgrp_verify - Verify that a resource group is consistent
/* Cross-checks the bitmap contents against the rgrp's cached counters
 * (rd_free, rd_dinodes, rd_data) and flags inconsistency on mismatch. */
465 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
467 struct gfs2_sbd *sdp = rgd->rd_sbd;
468 struct gfs2_bitmap *bi = NULL;
469 u32 length = rgd->rd_length;
473 memset(count, 0, 4 * sizeof(u32));
475 /* Count # blocks in each of 4 possible allocation states */
476 for (buf = 0; buf < length; buf++) {
477 bi = rgd->rd_bits + buf;
478 for (x = 0; x < 4; x++)
479 count[x] += gfs2_bitcount(rgd,
/* count[0] (free) must match the on-disk free-block counter. */
485 if (count[0] != rgd->rd_free) {
486 gfs2_lm(sdp, "free data mismatch: %u != %u\n",
487 count[0], rgd->rd_free);
488 gfs2_consist_rgrpd(rgd);
/* count[1] (used, non-metadata) must equal total - free - dinodes. */
492 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
493 if (count[1] != tmp) {
494 gfs2_lm(sdp, "used data mismatch: %u != %u\n",
496 gfs2_consist_rgrpd(rgd);
/* States 2 (unlinked) and 3 (dinode) together account for all dinodes. */
500 if (count[2] + count[3] != rgd->rd_dinodes) {
501 gfs2_lm(sdp, "used metadata mismatch: %u != %u\n",
502 count[2] + count[3], rgd->rd_dinodes);
503 gfs2_consist_rgrpd(rgd);
509 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
510 * @sdp: The GFS2 superblock
511 * @blk: The data block number
512 * @exact: True if this needs to be an exact match
514 * The @exact argument should be set to true by most callers. The exception
515 * is when we need to match blocks which are not represented by the rgrp
516 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
517 * there for alignment purposes. Another way of looking at it is that @exact
518 * matches only valid data/metadata blocks, but with @exact false, it will
519 * match any block within the extent of the rgrp.
521 * Returns: The resource group, or NULL if not found
524 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
526 struct rb_node *n, *next;
527 struct gfs2_rgrpd *cur;
/* Binary search of the rindex rb-tree under sd_rindex_spin; the tree
 * descent lines (rb_left/rb_right, loop header) are missing here. */
529 spin_lock(&sdp->sd_rindex_spin);
530 n = sdp->sd_rindex_tree.rb_node;
532 cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
534 if (blk < cur->rd_addr)
536 else if (blk >= cur->rd_data0 + cur->rd_data)
539 spin_unlock(&sdp->sd_rindex_spin);
/* With @exact, re-check that blk falls in the bitmap-covered range. */
541 if (blk < cur->rd_addr)
543 if (blk >= cur->rd_data0 + cur->rd_data)
550 spin_unlock(&sdp->sd_rindex_spin);
556 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
557 * @sdp: The GFS2 superblock
559 * Returns: The first rgrp in the filesystem
562 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
564 const struct rb_node *n;
565 struct gfs2_rgrpd *rgd;
/* Leftmost node of the rindex rb-tree = lowest-addressed rgrp. */
567 spin_lock(&sdp->sd_rindex_spin);
568 n = rb_first(&sdp->sd_rindex_tree);
569 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
570 spin_unlock(&sdp->sd_rindex_spin);
576 * gfs2_rgrpd_get_next - get the next RG
577 * @rgd: the resource group descriptor
579 * Returns: The next rgrp
582 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
584 struct gfs2_sbd *sdp = rgd->rd_sbd;
585 const struct rb_node *n;
587 spin_lock(&sdp->sd_rindex_spin);
588 n = rb_next(&rgd->rd_node);
/* Wrap around to the first rgrp when we fall off the end of the tree. */
590 n = rb_first(&sdp->sd_rindex_tree);
/* Single-rgrp filesystem: wrapping lands back on ourselves — no "next". */
592 if (unlikely(&rgd->rd_node == n)) {
593 spin_unlock(&sdp->sd_rindex_spin);
596 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
597 spin_unlock(&sdp->sd_rindex_spin);
/* check_and_update_goal - reset the inode's allocation goal block to its own
 * address if the current goal is unset or no longer maps to a valid rgrp
 * (e.g. after the filesystem shrank or the rindex changed). */
601 void check_and_update_goal(struct gfs2_inode *ip)
603 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
604 if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
605 ip->i_goal = ip->i_no_addr;
/* gfs2_free_clones - release every bitmap's clone buffer for this rgrp.
 * The per-bitmap kfree/NULL lines are missing in this extraction. */
608 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
612 for (x = 0; x < rgd->rd_length; x++) {
613 struct gfs2_bitmap *bi = rgd->rd_bits + x;
/* dump_rs - print one multi-block reservation (owning inode number, start
 * block, requested count) for debugging/glock dumps. */
619 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
620 const char *fs_id_buf)
622 struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
624 gfs2_print_dbg(seq, "%s B: n:%llu s:%llu f:%u\n",
626 (unsigned long long)ip->i_no_addr,
627 (unsigned long long)rs->rs_start,
632 * __rs_deltree - remove a multi-block reservation from the rgd tree
633 * @rs: The reservation to remove
/* Caller is presumed to hold rd_rsspin (this is the locked-context helper
 * for gfs2_rs_deltree below) — TODO confirm, locking lines not visible. */
636 static void __rs_deltree(struct gfs2_blkreserv *rs)
638 struct gfs2_rgrpd *rgd;
640 if (!gfs2_rs_active(rs))
644 trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
645 rb_erase(&rs->rs_node, &rgd->rd_rstree);
646 RB_CLEAR_NODE(&rs->rs_node);
/* Give the still-requested (unallocated) blocks back to the rgrp's
 * accounting and bump the extent-failure point so it gets recomputed. */
648 if (rs->rs_requested) {
649 /* return requested blocks to the rgrp */
650 BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested);
651 rs->rs_rgd->rd_requested -= rs->rs_requested;
653 /* The rgrp extent failure point is likely not to increase;
654 it will only do so if the freed blocks are somehow
655 contiguous with a span of free blocks that follows. Still,
656 it will force the number to be recalculated later. */
657 rgd->rd_extfail_pt += rs->rs_requested;
658 rs->rs_requested = 0;
663 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
664 * @rs: The reservation to remove
/* Locked wrapper: takes rd_rsspin, then (in lines missing here) calls
 * __rs_deltree before asserting nothing is still requested. */
667 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
669 struct gfs2_rgrpd *rgd;
673 spin_lock(&rgd->rd_rsspin);
675 BUG_ON(rs->rs_requested);
676 spin_unlock(&rgd->rd_rsspin);
681 * gfs2_rs_delete - delete a multi-block reservation
682 * @ip: The inode for this reservation
683 * @wcount: The inode's write count, or NULL
686 void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
688 down_write(&ip->i_rw_mutex);
/* Only tear the reservation down when this is the last writer (or the
 * caller passed no write count to check). */
689 if ((wcount == NULL) || (atomic_read(wcount) <= 1))
690 gfs2_rs_deltree(&ip->i_res);
691 up_write(&ip->i_rw_mutex);
695 * return_all_reservations - return all reserved blocks back to the rgrp.
696 * @rgd: the rgrp that needs its space back
698 * We previously reserved a bunch of blocks for allocation. Now we need to
699 * give them back. This leave the reservation structures in tact, but removes
700 * all of their corresponding "no-fly zones".
702 static void return_all_reservations(struct gfs2_rgrpd *rgd)
705 struct gfs2_blkreserv *rs;
/* Drain the reservation tree under rd_rsspin; the per-entry __rs_deltree
 * call is a missing line in this extraction. */
707 spin_lock(&rgd->rd_rsspin);
708 while ((n = rb_first(&rgd->rd_rstree))) {
709 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
712 spin_unlock(&rgd->rd_rsspin);
/* gfs2_clear_rgrpd - tear down every rgrp at unmount: detach each from the
 * rindex tree and its glock, release bitmaps/clones/reservations, free it. */
715 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
718 struct gfs2_rgrpd *rgd;
719 struct gfs2_glock *gl;
721 while ((n = rb_first(&sdp->sd_rindex_tree))) {
722 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
725 rb_erase(n, &sdp->sd_rindex_tree);
/* Demote a still-held glock and wait for its work to finish before
 * clearing the glock->rgrp back-pointer. */
728 if (gl->gl_state != LM_ST_UNLOCKED) {
729 gfs2_glock_cb(gl, LM_ST_UNLOCKED);
730 flush_delayed_work(&gl->gl_work);
732 gfs2_rgrp_brelse(rgd);
733 glock_clear_object(gl, rgd);
737 gfs2_free_clones(rgd);
738 return_all_reservations(rgd);
741 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
746 * gfs2_compute_bitstructs - Compute the bitmap sizes
747 * @rgd: The resource group descriptor
749 * Calculates bitmap descriptors, one for each block that contains bitmap data
754 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
756 struct gfs2_sbd *sdp = rgd->rd_sbd;
757 struct gfs2_bitmap *bi;
758 u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
759 u32 bytes_left, bytes;
765 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
769 bytes_left = rgd->rd_bitbytes;
/* Four layouts: whole bitmap in the header block; first of several
 * (gfs2_rgrp header); last block; middle blocks (gfs2_meta_header). */
771 for (x = 0; x < length; x++) {
772 bi = rgd->rd_bits + x;
775 /* small rgrp; bitmap stored completely in header block */
778 bi->bi_offset = sizeof(struct gfs2_rgrp);
780 bi->bi_bytes = bytes;
781 bi->bi_blocks = bytes * GFS2_NBBY;
/* First block of a multi-block bitmap: header is a full gfs2_rgrp. */
784 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
785 bi->bi_offset = sizeof(struct gfs2_rgrp);
787 bi->bi_bytes = bytes;
788 bi->bi_blocks = bytes * GFS2_NBBY;
/* Last block takes whatever bytes remain. */
790 } else if (x + 1 == length) {
792 bi->bi_offset = sizeof(struct gfs2_meta_header);
793 bi->bi_start = rgd->rd_bitbytes - bytes_left;
794 bi->bi_bytes = bytes;
795 bi->bi_blocks = bytes * GFS2_NBBY;
/* Middle blocks: full block minus the smaller meta header. */
798 bytes = sdp->sd_sb.sb_bsize -
799 sizeof(struct gfs2_meta_header);
800 bi->bi_offset = sizeof(struct gfs2_meta_header);
801 bi->bi_start = rgd->rd_bitbytes - bytes_left;
802 bi->bi_bytes = bytes;
803 bi->bi_blocks = bytes * GFS2_NBBY;
810 gfs2_consist_rgrpd(rgd);
/* Sanity check: the last bitmap must end exactly at rd_data blocks. */
813 bi = rgd->rd_bits + (length - 1);
814 if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
821 "start=%u len=%u offset=%u\n",
822 (unsigned long long)rgd->rd_addr,
824 (unsigned long long)rgd->rd_data0,
827 bi->bi_start, bi->bi_bytes, bi->bi_offset);
828 gfs2_consist_rgrpd(rgd);
836 * gfs2_ri_total - Total up the file system space, according to the rindex.
837 * @sdp: the filesystem
840 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
843 struct inode *inode = sdp->sd_rindex;
844 struct gfs2_inode *ip = GFS2_I(inode);
845 char buf[sizeof(struct gfs2_rindex)];
/* Walk the rindex file one fixed-size record at a time, summing each
 * rgrp's ri_data (data blocks); stop at EOF or short read. */
848 for (rgrps = 0;; rgrps++) {
849 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
851 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
853 error = gfs2_internal_read(ip, buf, &pos,
854 sizeof(struct gfs2_rindex));
855 if (error != sizeof(struct gfs2_rindex))
857 total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
/* rgd_insert - insert @rgd into the superblock's rindex rb-tree, keyed by
 * rd_addr. Caller holds sd_rindex_spin. The duplicate-key early return
 * (original ~line 878) is a missing line in this extraction. */
862 static int rgd_insert(struct gfs2_rgrpd *rgd)
864 struct gfs2_sbd *sdp = rgd->rd_sbd;
865 struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
867 /* Figure out where to put new node */
869 struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
873 if (rgd->rd_addr < cur->rd_addr)
874 newn = &((*newn)->rb_left);
875 else if (rgd->rd_addr > cur->rd_addr)
876 newn = &((*newn)->rb_right);
881 rb_link_node(&rgd->rd_node, parent, newn);
882 rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
888 * read_rindex_entry - Pull in a new resource index entry from the disk
889 * @ip: Pointer to the rindex inode
891 * Returns: 0 on success, > 0 on EOF, error code otherwise
894 static int read_rindex_entry(struct gfs2_inode *ip)
896 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
897 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
898 struct gfs2_rindex buf;
900 struct gfs2_rgrpd *rgd;
902 if (pos >= i_size_read(&ip->i_inode))
905 error = gfs2_internal_read(ip, (char *)&buf, &pos,
906 sizeof(struct gfs2_rindex));
/* Short read of 0 means clean EOF (>0 return); otherwise propagate. */
908 if (error != sizeof(struct gfs2_rindex))
909 return (error == 0) ? 1 : error;
911 rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
/* Decode the big-endian on-disk rindex record into the in-core rgrp. */
917 rgd->rd_addr = be64_to_cpu(buf.ri_addr);
918 rgd->rd_length = be32_to_cpu(buf.ri_length);
919 rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
920 rgd->rd_data = be32_to_cpu(buf.ri_data);
921 rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
922 spin_lock_init(&rgd->rd_rsspin);
924 error = compute_bitstructs(rgd);
928 error = gfs2_glock_get(sdp, rgd->rd_addr,
929 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
/* The lock value block (LVB) shares state with other cluster nodes. */
933 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
934 rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
935 if (rgd->rd_data > sdp->sd_max_rg_data)
936 sdp->sd_max_rg_data = rgd->rd_data;
937 spin_lock(&sdp->sd_rindex_spin);
938 error = rgd_insert(rgd);
939 spin_unlock(&sdp->sd_rindex_spin);
941 glock_set_object(rgd->rd_gl, rgd);
/* Insert collision: another reader beat us; drop our copy quietly. */
945 error = 0; /* someone else read in the rgrp; free it and ignore it */
946 gfs2_glock_put(rgd->rd_gl);
951 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
956 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
957 * @sdp: the GFS2 superblock
959 * The purpose of this function is to select a subset of the resource groups
960 * and mark them as PREFERRED. We do it in such a way that each node prefers
961 * to use a unique set of rgrps to minimize glock contention.
963 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
965 struct gfs2_rgrpd *rgd, *first;
968 /* Skip an initial number of rgrps, based on this node's journal ID.
969 That should start each node out on its own set. */
970 rgd = gfs2_rgrpd_get_first(sdp);
971 for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
972 rgd = gfs2_rgrpd_get_next(rgd);
/* Mark every sd_journals'th rgrp PREFERRED until we wrap back to the
 * starting point (outer do/while lines partially missing here). */
976 rgd->rd_flags |= GFS2_RDF_PREFERRED;
977 for (i = 0; i < sdp->sd_journals; i++) {
978 rgd = gfs2_rgrpd_get_next(rgd);
979 if (!rgd || rgd == first)
982 } while (rgd && rgd != first);
986 * gfs2_ri_update - Pull in a new resource index from the disk
987 * @ip: pointer to the rindex inode
989 * Returns: 0 on successful update, error code otherwise
992 static int gfs2_ri_update(struct gfs2_inode *ip)
994 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
/* Read rindex entries until EOF (>0) or error; a 0 return keeps looping. */
998 error = read_rindex_entry(ip);
999 } while (error == 0);
/* An rindex with no rgrps at all is a corrupt/unusable filesystem. */
1004 if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
1005 fs_err(sdp, "no resource groups found in the file system.\n");
1008 set_rgrp_preferences(sdp);
1010 sdp->sd_rindex_uptodate = 1;
1015 * gfs2_rindex_update - Update the rindex if required
1016 * @sdp: The GFS2 superblock
1018 * We grab a lock on the rindex inode to make sure that it doesn't
1019 * change whilst we are performing an operation. We keep this lock
1020 * for quite long periods of time compared to other locks. This
1021 * doesn't matter, since it is shared and it is very, very rarely
1022 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1024 * This makes sure that we're using the latest copy of the resource index
1025 * special file, which might have been updated if someone expanded the
1026 * filesystem (via gfs2_grow utility), which adds new resource groups.
1028 * Returns: 0 on succeess, error code otherwise
1031 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1033 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1034 struct gfs2_glock *gl = ip->i_gl;
1035 struct gfs2_holder ri_gh;
1037 int unlock_required = 0;
1039 /* Read new copy from disk if we don't have the latest */
1040 if (!sdp->sd_rindex_uptodate) {
/* Take the rindex glock shared unless we already hold it. */
1041 if (!gfs2_glock_is_locked_by_me(gl)) {
1042 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1045 unlock_required = 1;
/* Re-check under the lock: someone may have refreshed it meanwhile. */
1047 if (!sdp->sd_rindex_uptodate)
1048 error = gfs2_ri_update(ip);
1049 if (unlock_required)
1050 gfs2_glock_dq_uninit(&ri_gh);
/* gfs2_rgrp_in - decode an on-disk gfs2_rgrp header into the in-core rgrp.
 * In-core-only flag bits (GFS2_RDF_MASK) are preserved across the update. */
1056 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1058 const struct gfs2_rgrp *str = buf;
1061 rg_flags = be32_to_cpu(str->rg_flags);
1062 rg_flags &= ~GFS2_RDF_MASK;
1063 rgd->rd_flags &= GFS2_RDF_MASK;
1064 rgd->rd_flags |= rg_flags;
1065 rgd->rd_free = be32_to_cpu(str->rg_free);
1066 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1067 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1068 /* rd_data0, rd_data and rd_bitbytes already set from rindex */
/* gfs2_rgrp_ondisk2lvb - copy on-disk rgrp fields into the glock LVB, kept
 * big-endian so any cluster node can read it without conversion. */
1071 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1073 const struct gfs2_rgrp *str = buf;
1075 rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1076 rgl->rl_flags = str->rg_flags;
1077 rgl->rl_free = str->rg_free;
1078 rgl->rl_dinodes = str->rg_dinodes;
1079 rgl->rl_igeneration = str->rg_igeneration;
/* gfs2_rgrp_out - encode the in-core rgrp back to its on-disk form in @buf,
 * including the rg_skip hint to the next rgrp, a CRC over the header, and a
 * refresh of the glock LVB so other nodes see the new counters. */
1083 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1085 struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd)
1086 struct gfs2_rgrp *str = buf;
1089 str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1090 str->rg_free = cpu_to_be32(rgd->rd_free);
1091 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
/* rg_skip records the distance to the next rgrp, when there is one ahead
 * of us (the "if (next == NULL)" line is missing in this extraction). */
1094 else if (next->rd_addr > rgd->rd_addr)
1095 str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1096 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1097 str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1098 str->rg_data = cpu_to_be32(rgd->rd_data);
1099 str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1101 crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1102 str->rg_crc = cpu_to_be32(crc);
1104 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1105 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
/* gfs2_rgrp_lvb_valid - compare the glock LVB copy of the rgrp counters
 * against the on-disk header in the first bitmap buffer; warn on every
 * mismatch. (Both sides are big-endian, so fields compare directly.) */
1108 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1110 struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1111 struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1112 struct gfs2_sbd *sdp = rgd->rd_sbd;
1115 if (rgl->rl_flags != str->rg_flags) {
1116 fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1117 (unsigned long long)rgd->rd_addr,
1118 be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1121 if (rgl->rl_free != str->rg_free) {
1122 fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1123 (unsigned long long)rgd->rd_addr,
1124 be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1127 if (rgl->rl_dinodes != str->rg_dinodes) {
1128 fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1129 (unsigned long long)rgd->rd_addr,
1130 be32_to_cpu(rgl->rl_dinodes),
1131 be32_to_cpu(str->rg_dinodes));
1134 if (rgl->rl_igeneration != str->rg_igeneration) {
1135 fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1136 (unsigned long long)rgd->rd_addr,
1137 (unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1138 (unsigned long long)be64_to_cpu(str->rg_igeneration));
/* count_unlinked - scan every bitmap of @rgd for blocks in the UNLINKED
 * state and return how many were found. The goal-advance and count
 * increment lines are missing in this extraction. */
1144 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1146 struct gfs2_bitmap *bi;
1147 const u32 length = rgd->rd_length;
1148 const u8 *buffer = NULL;
1149 u32 i, goal, count = 0;
1151 for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1153 buffer = bi->bi_bh->b_data + bi->bi_offset;
1154 WARN_ON(!buffer_uptodate(bi->bi_bh));
1155 while (goal < bi->bi_blocks) {
1156 goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1157 GFS2_BLKST_UNLINKED);
1158 if (goal == BFITNOENT)
/* rgrp_set_bitmap_flags - recompute the per-bitmap GBF_FULL flags: clear
 * them when the rgrp still has free blocks, set them when it is full.
 * (The "if (rgd->rd_free)" / else branch lines are missing here.) */
1168 static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
1170 struct gfs2_bitmap *bi;
1174 for (x = 0; x < rgd->rd_length; x++) {
1175 bi = rgd->rd_bits + x;
1176 clear_bit(GBF_FULL, &bi->bi_flags);
1179 for (x = 0; x < rgd->rd_length; x++) {
1180 bi = rgd->rd_bits + x;
1181 set_bit(GBF_FULL, &bi->bi_flags);
1187 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1188 * @rgd: the struct gfs2_rgrpd describing the RG to read in
1190 * Read in all of a Resource Group's header and bitmap blocks.
1191 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1196 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1198 struct gfs2_sbd *sdp = rgd->rd_sbd;
1199 struct gfs2_glock *gl = rgd->rd_gl;
1200 unsigned int length = rgd->rd_length;
1201 struct gfs2_bitmap *bi;
/* Already read in: the first buffer-head being set marks the whole set. */
1205 if (rgd->rd_bits[0].bi_bh != NULL)
/* Issue all the reads asynchronously first... */
1208 for (x = 0; x < length; x++) {
1209 bi = rgd->rd_bits + x;
1210 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
/* ...then wait for completion in reverse order and verify metatypes:
 * block 0 is the RG header, the rest are RB (bitmap) blocks. */
1215 for (y = length; y--;) {
1216 bi = rgd->rd_bits + y;
1217 error = gfs2_meta_wait(sdp, bi->bi_bh);
1220 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1221 GFS2_METATYPE_RG)) {
/* First read since (in)validation: decode the header and reset the
 * clone counters / allocation-failure point. */
1227 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1228 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1229 rgrp_set_bitmap_flags(rgd);
1230 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1231 rgd->rd_free_clone = rgd->rd_free;
1232 BUG_ON(rgd->rd_reserved);
1233 /* max out the rgrp allocation failure point */
1234 rgd->rd_extfail_pt = rgd->rd_free;
/* LVB not yet initialized (bad magic): populate it from disk. */
1236 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1237 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1238 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1239 rgd->rd_bits[0].bi_bh->b_data);
1241 else if (sdp->sd_args.ar_rgrplvb) {
1242 if (!gfs2_rgrp_lvb_valid(rgd)){
1243 gfs2_consist_rgrpd(rgd);
1247 if (rgd->rd_rgl->rl_unlinked == 0)
1248 rgd->rd_flags &= ~GFS2_RDF_CHECK;
/* Error path (label lines missing): release buffer-heads taken so far. */
1254 bi = rgd->rd_bits + x;
1257 gfs2_assert_warn(sdp, !bi->bi_clone);
/* update_rgrp_lvb - refresh the in-core rgrp from the glock LVB instead of
 * reading the bitmap blocks from disk; falls back to gfs2_rgrp_bh_get()
 * when the LVB has not been initialized (magic mismatch). */
1263 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1267 if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1270 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1271 return gfs2_rgrp_bh_get(rgd);
/* Merge LVB flags with the in-core-only flag bits, same scheme as
 * gfs2_rgrp_in(). */
1273 rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1274 rl_flags &= ~GFS2_RDF_MASK;
1275 rgd->rd_flags &= GFS2_RDF_MASK;
1276 rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1277 if (rgd->rd_rgl->rl_unlinked == 0)
1278 rgd->rd_flags &= ~GFS2_RDF_CHECK;
1279 rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1280 rgrp_set_bitmap_flags(rgd);
1281 rgd->rd_free_clone = rgd->rd_free;
1282 BUG_ON(rgd->rd_reserved);
1283 /* max out the rgrp allocation failure point */
1284 rgd->rd_extfail_pt = rgd->rd_free;
1285 rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1286 rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
/* gfs2_rgrp_go_lock - glock "go" callback: on lock acquisition, read the
 * rgrp's bitmaps, unless GL_SKIP + rgrplvb mount option allow skipping. */
1290 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1292 struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1293 struct gfs2_sbd *sdp = rgd->rd_sbd;
1295 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1297 return gfs2_rgrp_bh_get(rgd);
1301 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1302 * @rgd: The resource group
/* The brelse()/NULL-out lines for each bi_bh are missing in this
 * extraction. */
1306 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1308 int x, length = rgd->rd_length;
1310 for (x = 0; x < length; x++) {
1311 struct gfs2_bitmap *bi = rgd->rd_bits + x;
/* gfs2_rgrp_send_discards - walk one bitmap, find runs of blocks that were
 * freed (set in the clone/original diff), coalesce them into extents of at
 * least @minlen blocks, and issue block-device discard (TRIM) requests.
 * On discard error, warn and disable discards for the filesystem. */
1319 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1320 struct buffer_head *bh,
1321 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1323 struct super_block *sb = sdp->sd_vfs;
1326 sector_t nr_blks = 0;
1332 for (x = 0; x < bi->bi_bytes; x++) {
1333 const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1334 clone += bi->bi_offset;
/* diff has a 1 in the low bit of each 2-bit field whose block is free in
 * the clone but was not free in @bh (i.e. newly freed). */
1337 const u8 *orig = bh->b_data + bi->bi_offset + x;
1338 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1340 diff = ~(*clone | (*clone >> 1));
1345 blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1349 goto start_new_extent;
/* Non-contiguous block: flush the pending extent if big enough. */
1350 if ((start + nr_blks) != blk) {
1351 if (nr_blks >= minlen) {
1352 rv = sb_issue_discard(sb,
/* Flush the final pending extent after the scan. */
1369 if (nr_blks >= minlen) {
1370 rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1376 *ptrimmed = trimmed;
1380 if (sdp->sd_args.ar_discard)
1381 fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1382 sdp->sd_args.ar_discard = 0;
1387 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1388 * @filp: Any file on the filesystem
1389 * @argp: Pointer to the arguments (also used to pass result)
1391 * Returns: 0 on success, otherwise error code
/*
 * gfs2_fitrim - implement the FITRIM ioctl: walk the resource groups that
 * overlap the user-supplied range and discard their free blocks.
 * Requires CAP_SYS_ADMIN, a live journal, and a queue that supports
 * discard. The trimmed byte count is copied back through @argp.
 */
1394 int gfs2_fitrim(struct file *filp, void __user *argp)
1396 	struct inode *inode = file_inode(filp);
1397 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1398 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1399 	struct buffer_head *bh;
1400 	struct gfs2_rgrpd *rgd;
1401 	struct gfs2_rgrpd *rgd_end;
1402 	struct gfs2_holder gh;
1403 	struct fstrim_range r;
1407 	u64 start, end, minlen;
1409 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1411 	if (!capable(CAP_SYS_ADMIN))
1414 	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
1417 	if (!blk_queue_discard(q))
1420 	if (copy_from_user(&r, argp, sizeof(r)))
1423 	ret = gfs2_rindex_update(sdp);
/* Convert the byte range from userspace into filesystem blocks. */
1427 	start = r.start >> bs_shift;
1428 	end = start + (r.len >> bs_shift);
/* Honour the device's discard granularity as a lower bound on minlen. */
1429 	minlen = max_t(u64, r.minlen,
1430 		       q->limits.discard_granularity) >> bs_shift;
1432 	if (end <= start || minlen > sdp->sd_max_rg_data)
1435 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1436 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1438 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1439 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1440 		return -EINVAL; /* start is beyond the end of the fs */
/* Each rgrp is trimmed under an exclusive glock. */
1444 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1448 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1449 			/* Trim each bitmap in the rgrp */
1450 			for (x = 0; x < rgd->rd_length; x++) {
1451 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
/* bh == NULL: discard every free block, not just newly-freed ones. */
1452 				ret = gfs2_rgrp_send_discards(sdp,
1453 						rgd->rd_data0, NULL, bi, minlen,
1456 					gfs2_glock_dq_uninit(&gh);
1462 			/* Mark rgrp as having been trimmed */
1463 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1465 				bh = rgd->rd_bits[0].bi_bh;
1466 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1467 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1468 				gfs2_rgrp_out(rgd, bh->b_data);
1469 				gfs2_trans_end(sdp);
1472 		gfs2_glock_dq_uninit(&gh);
1477 		rgd = gfs2_rgrpd_get_next(rgd);
/* Report back how many bytes were actually trimmed. */
1481 	r.len = trimmed << bs_shift;
1482 	if (copy_to_user(argp, &r, sizeof(r)))
1489 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1490 * @ip: the inode structure
/*
 * rs_insert - insert the inode's multi-block reservation into the rgrp's
 * rb-tree, keyed by rs_cmp() on (rs_start, rs_requested), and account the
 * requested blocks in rd_requested. Called with no reservation active;
 * the tree is protected by rd_rsspin.
 */
1493 static void rs_insert(struct gfs2_inode *ip)
1495 	struct rb_node **newn, *parent = NULL;
1497 	struct gfs2_blkreserv *rs = &ip->i_res;
1498 	struct gfs2_rgrpd *rgd = rs->rs_rgd;
/* Must not already be linked into a tree. */
1500 	BUG_ON(gfs2_rs_active(rs));
1502 	spin_lock(&rgd->rd_rsspin);
1503 	newn = &rgd->rd_rstree.rb_node;
/* Standard rb-tree descent; branch direction comes from rs_cmp(). */
1505 		struct gfs2_blkreserv *cur =
1506 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1509 		rc = rs_cmp(rs->rs_start, rs->rs_requested, cur);
1511 			newn = &((*newn)->rb_right);
1513 			newn = &((*newn)->rb_left);
/* NOTE(review): this unlock looks like the overlap/error exit — confirm. */
1515 			spin_unlock(&rgd->rd_rsspin);
1521 	rb_link_node(&rs->rs_node, parent, newn);
1522 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1524 	/* Do our rgrp accounting for the reservation */
1525 	rgd->rd_requested += rs->rs_requested; /* blocks requested */
1526 	spin_unlock(&rgd->rd_rsspin);
1527 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1531 * rgd_free - return the number of free blocks we can allocate.
1532 * @rgd: the resource group
1534 * This function returns the number of free blocks for an rgrp.
1535 * That's the clone-free blocks (blocks that are free, not including those
1536 * still being used for unlinked files that haven't been deleted.)
1538 * It also subtracts any blocks reserved by someone else, but does not
1539 * include free blocks that are still part of our current reservation,
1540 * because obviously we can (and will) allocate them.
/*
 * rgd_free - number of clone-free blocks in @rgd that are allocatable by
 * the holder of reservation @rs: total free-clone blocks minus blocks
 * requested by everyone else (our own rs_requested is excluded).
 * The WARN_ON_ONCE guards against accounting underflow.
 */
1542 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1544 	u32 tot_reserved, tot_free;
/* Accounting inconsistency: rd_requested should always include rs. */
1546 	if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested))
1548 	tot_reserved = rgd->rd_requested - rs->rs_requested;
/* More reserved than free: clamp (elided branch presumably zeroes it). */
1550 	if (rgd->rd_free_clone < tot_reserved)
1553 	tot_free = rgd->rd_free_clone - tot_reserved;
1559 * rg_mblk_search - find a group of multiple free blocks to form a reservation
1560 * @rgd: the resource group descriptor
1561 * @ip: pointer to the inode for which we're reserving blocks
1562 * @ap: the allocation parameters
/*
 * rg_mblk_search - try to establish a multi-block reservation in @rgd for
 * @ip. The requested extent length is derived from the inode's size hint
 * and the allocation target, clamped to [RGRP_RSRV_MINBLKS, free_blocks].
 * On success (via gfs2_rbm_find) the reservation start/length are set;
 * the rs_insert call itself appears elided in this copy.
 */
1566 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1567 			   const struct gfs2_alloc_parms *ap)
1569 	struct gfs2_rbm rbm = { .rgd = rgd, };
1571 	struct gfs2_blkreserv *rs = &ip->i_res;
1573 	u32 free_blocks, blocks_available;
1575 	struct inode *inode = &ip->i_inode;
/* Snapshot the rgrp's availability under the reservation spinlock. */
1577 	spin_lock(&rgd->rd_rsspin);
1578 	free_blocks = rgd_free(rgd, rs);
1579 	if (rgd->rd_free_clone < rgd->rd_requested)
1581 	blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
/* Our own reserved blocks still count as available to us. */
1582 	if (rgd == rs->rs_rgd)
1583 		blocks_available += rs->rs_reserved;
1584 	spin_unlock(&rgd->rd_rsspin);
/* Directories get a different extent sizing (branch body elided). */
1586 	if (S_ISDIR(inode->i_mode))
1589 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1590 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1592 	if (free_blocks < extlen || blocks_available < extlen)
1595 	/* Find bitmap block that contains bits for goal block */
1596 	if (rgrp_contains_block(rgd, ip->i_goal))
1599 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1601 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1604 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true);
1606 		rs->rs_start = gfs2_rbm_to_block(&rbm);
1607 		rs->rs_requested = extlen;
/* Search failed from rd_last_alloc: reset so next attempt starts at 0. */
1610 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1611 			rgd->rd_last_alloc = 0;
1616 * gfs2_next_unreserved_block - Return next block that is not reserved
1617 * @rgd: The resource group
1618 * @block: The starting block
1619 * @length: The required length
1620 * @ignore_rs: Reservation to ignore
1622 * If the block does not appear in any reservation, then return the
1623 * block number unchanged. If it does appear in the reservation, then
1624 * keep looking through the tree of reservations in order to find the
1625 * first block number which is not reserved.
/*
 * gfs2_next_unreserved_block - starting from @block, return the first
 * block of a @length run that is not covered by any reservation in the
 * rgrp's rb-tree (skipping @ignore_rs, normally the caller's own).
 * Walks the tree under rd_rsspin, then steps through contiguous
 * reservations until a gap is found.
 */
1628 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1630 				      struct gfs2_blkreserv *ignore_rs)
1632 	struct gfs2_blkreserv *rs;
1636 	spin_lock(&rgd->rd_rsspin);
1637 	n = rgd->rd_rstree.rb_node;
/* Locate a reservation overlapping [block, block+length). */
1639 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1640 		rc = rs_cmp(block, length, rs);
/* Hop past each overlapping reservation in turn. */
1650 		while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) {
1651 			block = rs->rs_start + rs->rs_requested;
1655 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1659 	spin_unlock(&rgd->rd_rsspin);
1664 * gfs2_reservation_check_and_update - Check for reservations during block alloc
1665 * @rbm: The current position in the resource group
1666 * @rs: Our own reservation
1667 * @minext: The minimum extent length
1668 * @maxext: A pointer to the maximum extent structure
1670 * This checks the current position in the rgrp to see whether there is
1671 * a reservation covering this block. If not then this function is a
1672 * no-op. If there is, then the position is moved to the end of the
1673 * contiguous reservation(s) so that we are pointing at the first
1674 * non-reserved block.
1676 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
/*
 * gfs2_reservation_check_and_update - during a bitmap search, check
 * whether the current position collides with someone else's reservation,
 * and if so advance @rbm past the reserved region. Also tracks the
 * largest usable extent seen in @maxext so a too-small-but-best extent
 * can be used as a fallback.
 * Returns 0 (no reservation / extent usable), 1 if @rbm moved, or an
 * error (-E2BIG when the skip would leave the rgrp — see caller).
 */
1679 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1680 					     struct gfs2_blkreserv *rs,
1682 					     struct gfs2_extent *maxext)
1684 	u64 block = gfs2_rbm_to_block(rbm);
1689 	 * If we have a minimum extent length, then skip over any extent
1690 	 * which is less than the min extent length in size.
/* Extent no longer than the best already recorded: not interesting. */
1693 		extlen = gfs2_free_extlen(rbm, minext);
1694 		if (extlen <= maxext->len)
1699 	 * Check the extent which has been found against the reservations
1700 	 * and skip if parts of it are already reserved
1702 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs);
1703 	if (nblock == block) {
1704 		if (!minext || extlen >= minext)
/* Too short, but remember it as the best candidate so far. */
1707 		if (extlen > maxext->len) {
1708 			maxext->len = extlen;
1712 		u64 len = nblock - block;
/* A skip of >= 2^32 blocks cannot be represented in the u32 offset. */
1713 		if (len >= (u64)1 << 32)
1718 	if (gfs2_rbm_add(rbm, extlen))
1724 * gfs2_rbm_find - Look for blocks of a particular state
1725 * @rbm: Value/result starting position and final position
1726 * @state: The state which we want to find
1727 * @minext: Pointer to the requested extent length
1728 * This is updated to be the actual reservation size.
1729 * @rs: Our own reservation (NULL to skip checking for reservations)
1730 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1731 * around until we've reached the starting point.
1734 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1735 * has no free blocks in it.
1736 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1737 * has come up short on a free block search.
1739 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
/*
 * gfs2_rbm_find - scan the rgrp's bitmaps for a run of blocks in @state,
 * starting at @rbm and optionally wrapping around the rgrp (@nowrap).
 * Side effects: sets GBF_FULL on bitmaps with no free blocks, and lowers
 * rd_extfail_pt when a full scan fails to find *minext free blocks.
 * *minext is updated to the actual extent length found (or the best
 * fallback from maxext).
 */
1742 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1743 			 struct gfs2_blkreserv *rs, bool nowrap)
1745 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1746 	struct buffer_head *bh;
1750 	bool wrapped = false;
1752 	struct gfs2_bitmap *bi;
1753 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1756 	 * Determine the last bitmap to search. If we're not starting at the
1757 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1758 	 * the entire resource group.
1760 	last_bii = rbm->bii - (rbm->offset == 0);
/* Bitmaps already known full can be skipped for free-block searches. */
1764 		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1765 		    (state == GFS2_BLKST_FREE))
1769 		buffer = bh->b_data + bi->bi_offset;
1770 		WARN_ON(!buffer_uptodate(bh));
/* Use the clone bitmap (pending frees) unless looking for unlinked. */
1771 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1772 			buffer = bi->bi_clone + bi->bi_offset;
1773 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1774 		if (offset == BFITNOENT) {
/* Whole bitmap scanned with no free block: remember it's full. */
1775 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1776 				set_bit(GBF_FULL, &bi->bi_flags);
1779 		rbm->offset = offset;
/* rs == NULL skips the reservation-collision check entirely. */
1783 		ret = gfs2_reservation_check_and_update(rbm, rs, *minext,
1789 		if (ret == -E2BIG) {
1792 			goto res_covered_end_of_rgrp;
1796 next_bitmap:	/* Find next bitmap in the rgrp */
1799 		if (rbm->bii == rbm->rgd->rd_length)
1801 res_covered_end_of_rgrp:
/* Back at bitmap 0 means we wrapped (unless nowrap stops us). */
1802 		if (rbm->bii == 0) {
1810 		/* Have we scanned the entire resource group? */
1811 		if (wrapped && rbm->bii > last_bii)
1815 	if (state != GFS2_BLKST_FREE)
1818 	/* If the extent was too small, and it's smaller than the smallest
1819 	   to have failed before, remember for future reference that it's
1820 	   useless to search this rgrp again for this amount or more. */
1821 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1822 	    *minext < rbm->rgd->rd_extfail_pt)
1823 		rbm->rgd->rd_extfail_pt = *minext - 1;
1825 	/* If the maximum extent we found is big enough to fulfill the
1826 	   minimum requirements, use it anyway. */
1829 		*minext = maxext.len;
1837 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1839 * @last_unlinked: block address of the last dinode we unlinked
1840 * @skip: block address we should explicitly not unlink
1842 * Returns: 0 if no error
1843 * The inode, if one has been found, in inode.
/*
 * try_rgrp_unlink - scan @rgd for unlinked-but-still-allocated dinodes
 * and queue delete work for them so their blocks can be reclaimed.
 * @last_unlinked is advanced past each block handled; @skip (the caller's
 * own inode) is never queued. Clears GFS2_RDF_CHECK when the scan
 * completes.
 */
1846 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1849 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1850 	struct gfs2_glock *gl;
1851 	struct gfs2_inode *ip;
1854 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
/* Look for the next block marked unlinked; NULL rs: ignore reservations. */
1857 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1859 		if (error == -ENOSPC)
1861 		if (WARN_ON_ONCE(error))
1864 		block = gfs2_rbm_to_block(&rbm);
/* Advance past this block for the next loop iteration. */
1865 		if (gfs2_rbm_from_block(&rbm, block + 1))
/* Don't revisit blocks at or before the last one we handled. */
1867 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1871 		*last_unlinked = block;
1873 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1877 		/* If the inode is already in cache, we can ignore it here
1878 		 * because the existing inode disposal code will deal with
1879 		 * it when all refs have gone away. Accessing gl_object like
1880 		 * this is not safe in general. Here it is ok because we do
1881 		 * not dereference the pointer, and we only need an approx
1882 		 * answer to whether it is NULL or not.
1886 		if (ip || !gfs2_queue_delete_work(gl, 0))
1891 		/* Limit reclaim to sensible number of tasks */
1892 		if (found > NR_CPUS)
1896 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1901 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1902 * @rgd: The rgrp in question
1903 * @loops: An indication of how picky we can be (0=very, 1=less so)
1905 * This function uses the recently added glock statistics in order to
1906 * figure out whether a parciular resource group is suffering from
1907 * contention from multiple nodes. This is done purely on the basis
1908 * of timings, since this is the only data we have to work with and
1909 * our aim here is to reject a resource group which is highly contended
1910 * but (very important) not to do this too often in order to ensure that
1911 * we do not land up introducing fragmentation by changing resource
1912 * groups when not actually required.
1914 * The calculation is fairly simple, we want to know whether the SRTTB
1915 * (i.e. smoothed round trip time for blocking operations) to acquire
1916 * the lock for this rgrp's glock is significantly greater than the
1917 * time taken for resource groups on average. We introduce a margin in
1918 * the form of the variable @var which is computed as the sum of the two
1919 * respective variences, and multiplied by a factor depending on @loops
1920 * and whether we have a lot of data to base the decision on. This is
1921 * then tested against the square difference of the means in order to
1922 * decide whether the result is statistically significant or not.
1924 * Returns: A boolean verdict on the congestion status
/*
 * gfs2_rgrp_congested - decide, from glock round-trip-time statistics,
 * whether this rgrp's glock is significantly slower to acquire than the
 * average rgrp glock (i.e. contended by other nodes). The squared mean
 * difference is tested against the summed variances, scaled by a factor
 * that depends on @loops and sample counts (scaling lines elided here).
 */
1927 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1929 	const struct gfs2_glock *gl = rgd->rd_gl;
1930 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1931 	struct gfs2_lkstats *st;
1932 	u64 r_dcount, l_dcount;
1933 	u64 l_srttb, a_srttb = 0;
1937 	int cpu, nonzero = 0;
/* Average the smoothed blocking RTT over all CPUs with samples. */
1940 	for_each_present_cpu(cpu) {
1941 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1942 		if (st->stats[GFS2_LKS_SRTTB]) {
1943 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1947 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1949 		do_div(a_srttb, nonzero);
1950 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
/* Margin: sum of the local and per-glock RTT variances. */
1951 	var = st->stats[GFS2_LKS_SRTTVARB] +
1952 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1955 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1956 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
/* Not enough data to make a statistically meaningful call. */
1958 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1961 	srttb_diff = a_srttb - l_srttb;
1962 	sqr_diff = srttb_diff * srttb_diff;
/* Few samples: widen the margin (widening factor elided in this copy). */
1965 	if (l_dcount < 8 || r_dcount < 8)
/* Congested iff local RTT exceeds average by more than the margin. */
1970 	return ((srttb_diff < 0) && (sqr_diff > var));
1974 * gfs2_rgrp_used_recently
1975 * @rs: The block reservation with the rgrp to test
1976 * @msecs: The time limit in milliseconds
1978 * Returns: True if the rgrp glock has been used within the time limit
/*
 * gfs2_rgrp_used_recently - report whether the reservation's rgrp glock
 * was last used within @msecs milliseconds (based on gl_dstamp).
 * NOTE(review): the comparison as visible returns true when the age
 * EXCEEDS the limit; the kernel-doc above says the opposite — confirm
 * against upstream whether an inversion/elided negation applies.
 */
1980 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1985 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1986                             rs->rs_rgd->rd_gl->gl_dstamp));
1988 	return tdiff > (msecs * 1000 * 1000);
/*
 * gfs2_orlov_skip - pick a pseudo-random number of resource groups to
 * skip, used to spread directory allocations across the filesystem
 * (Orlov-style allocator).
 */
1991 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1993 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1996 	get_random_bytes(&skip, sizeof(skip));
1997 	return skip % sdp->sd_rgrps;
/*
 * gfs2_select_rgrp - advance *pos to the next resource group, wrapping
 * to the first rgrp at the end of the list. Returns true while there are
 * still rgrps to try (i.e. we have not come back around to @begin).
 */
2000 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
2002 	struct gfs2_rgrpd *rgd = *pos;
2003 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2005 	rgd = gfs2_rgrpd_get_next(rgd);
/* Fell off the end of the rgrp list: wrap to the first one. */
2007 		rgd = gfs2_rgrpd_get_first(sdp);
2009 	if (rgd != begin) /* If we didn't wrap */
2015 * fast_to_acquire - determine if a resource group will be fast to acquire
2017 * If this is one of our preferred rgrps, it should be quicker to acquire,
2018 * because we tried to set ourselves up as dlm lock master.
/*
 * fast_to_acquire - heuristic: will this rgrp's glock be cheap to take?
 * True when the glock is already held locally with no demote pending, or
 * when the rgrp is one of this node's preferred rgrps (likely DLM lock
 * master).
 */
2020 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2022 	struct gfs2_glock *gl = rgd->rd_gl;
/* Locked locally, idle, and not being demoted: reuse is cheap. */
2024 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2025 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2026 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2028 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2034 * gfs2_inplace_reserve - Reserve space in the filesystem
2035 * @ip: the inode to reserve space for
2036 * @ap: the allocation parameters
2038 * We try our best to find an rgrp that has at least ap->target blocks
2039 * available. After a couple of passes (loops == 2), the prospects of finding
2040 * such an rgrp diminish. At this stage, we return the first rgrp that has
2041 * at least ap->min_target blocks available.
2043 * Returns: 0 on success,
2044 * -ENOMEM if a suitable rgrp can't be found
/*
 * gfs2_inplace_reserve - find and lock an rgrp with at least ap->target
 * free blocks for @ip, reserving them in rd_reserved. After two full
 * passes the target drops to ap->min_target. Rgrps may be skipped for
 * congestion (early loops), NOALLOC/error flags, or a known-too-low
 * rd_extfail_pt. Several branch bodies and the loop header are elided
 * in this copy.
 */
2048 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2050 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2051 	struct gfs2_rgrpd *begin = NULL;
2052 	struct gfs2_blkreserv *rs = &ip->i_res;
2053 	int error = 0, rg_locked, flags = 0;
2054 	u64 last_unlinked = NO_BLOCK;
2055 	u32 target = ap->target;
2057 	u32 free_blocks, blocks_available, skip = 0;
/* Caller must not already hold an in-place reservation. */
2059 	BUG_ON(rs->rs_reserved);
2061 	if (sdp->sd_args.ar_rgrplvb)
2063 	if (gfs2_assert_warn(sdp, target))
/* Choose a starting rgrp: active reservation, goal's rgrp, or lookup. */
2065 	if (gfs2_rs_active(rs)) {
2067 	} else if (rs->rs_rgd &&
2068 		   rgrp_contains_block(rs->rs_rgd, ip->i_goal)) {
2071 		check_and_update_goal(ip);
2072 		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
/* Orlov spreading for directories: start a random number of rgrps in. */
2074 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2075 		skip = gfs2_orlov_skip(ip);
2076 	if (rs->rs_rgd == NULL)
2080 		struct gfs2_rgrpd *rgd;
2084 		if (!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
/* Without a reservation, avoid slow or congested rgrps on early loops. */
2088 			if (!gfs2_rs_active(rs)) {
2090 				    !fast_to_acquire(rs->rs_rgd))
2093 				    gfs2_rgrp_used_recently(rs, 1000) &&
2094 				    gfs2_rgrp_congested(rs->rs_rgd, loops))
2097 			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
2098 						   LM_ST_EXCLUSIVE, flags,
2100 			if (unlikely(error))
/* Re-check congestion after acquiring the glock. */
2102 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2103 			    gfs2_rgrp_congested(rs->rs_rgd, loops))
2105 			if (sdp->sd_args.ar_rgrplvb) {
2106 				error = update_rgrp_lvb(rs->rs_rgd);
2107 				if (unlikely(error)) {
2108 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2114 		/* Skip unusable resource groups */
2115 		if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC |
2117 		    (loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
2120 			if (sdp->sd_args.ar_rgrplvb)
2121 				gfs2_rgrp_bh_get(rs->rs_rgd);
2123 		/* Get a reservation if we don't already have one */
2124 		if (!gfs2_rs_active(rs))
2125 			rg_mblk_search(rs->rs_rgd, ip, ap);
2127 		/* Skip rgrps when we can't get a reservation on first pass */
2128 		if (!gfs2_rs_active(rs) && (loops < 1))
2131 		/* If rgrp has enough free space, use it */
2133 		spin_lock(&rgd->rd_rsspin);
2134 		free_blocks = rgd_free(rgd, rs);
2135 		blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
2136 		if (free_blocks < target || blocks_available < target) {
2137 			spin_unlock(&rgd->rd_rsspin);
/* Success: reserve up to ap->target blocks, capped by availability. */
2140 		rs->rs_reserved = ap->target;
2141 		if (rs->rs_reserved > blocks_available)
2142 			rs->rs_reserved = blocks_available;
2143 		rgd->rd_reserved += rs->rs_reserved;
2144 		spin_unlock(&rgd->rd_rsspin);
2147 		/* Check for unlinked inodes which can be reclaimed */
2148 		if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK)
2149 			try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
2152 		/* Drop reservation, if we couldn't use reserved rgrp */
2153 		if (gfs2_rs_active(rs))
2154 			gfs2_rs_deltree(rs);
2156 		/* Unlock rgrp if required */
2158 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2160 		/* Find the next rgrp, and continue looking */
2161 		if (gfs2_select_rgrp(&rs->rs_rgd, begin))
2166 		/* If we've scanned all the rgrps, but found no free blocks
2167 		 * then this checks for some less likely conditions before
2171 		/* Check that fs hasn't grown if writing to rindex */
2172 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2173 			error = gfs2_ri_update(ip);
2177 		/* Flushing the log may release space */
/* Final pass: settle for the minimum acceptable target. */
2180 			target = ap->min_target;
2181 		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2182 			       GFS2_LFC_INPLACE_RESERVE);
2190 * gfs2_inplace_release - release an inplace reservation
2191 * @ip: the inode the reservation was taken out on
2193 * Release a reservation made by gfs2_inplace_reserve().
/*
 * gfs2_inplace_release - undo gfs2_inplace_reserve(): return any
 * still-reserved blocks to the rgrp's rd_reserved count and drop the
 * rgrp glock holder if it was initialized.
 */
2196 void gfs2_inplace_release(struct gfs2_inode *ip)
2198 	struct gfs2_blkreserv *rs = &ip->i_res;
2200 	if (rs->rs_reserved) {
2201 		struct gfs2_rgrpd *rgd = rs->rs_rgd;
2203 		spin_lock(&rgd->rd_rsspin);
/* rd_reserved must always cover our share; underflow means corruption. */
2204 		BUG_ON(rgd->rd_reserved < rs->rs_reserved);
2205 		rgd->rd_reserved -= rs->rs_reserved;
2206 		spin_unlock(&rgd->rd_rsspin);
2207 		rs->rs_reserved = 0;
2209 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2210 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2214 * gfs2_alloc_extent - allocate an extent from a given bitmap
2215 * @rbm: the resource group information
2216 * @dinode: TRUE if the first block we allocate is for a dinode
2217 * @n: The extent length (value/result)
2219 * Add the bitmap buffer to the transaction.
2220 * Set the found bits to @new_state to change block's allocation state.
/*
 * gfs2_alloc_extent - mark an extent allocated starting at @rbm: the
 * first block becomes DINODE or USED depending on @dinode, following
 * free blocks become USED, up to *n (value/result) blocks. Each touched
 * bitmap buffer is added to the current transaction.
 */
2222 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2225 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2226 	const unsigned int elen = *n;
2231 	block = gfs2_rbm_to_block(rbm);
2232 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
/* First block: dinode or data/metadata, per the caller. */
2233 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2236 		ret = gfs2_rbm_from_block(&pos, block);
/* Stop extending at the first non-free block (or conversion failure). */
2237 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2239 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2240 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2247 * rgblk_free - Change alloc state of given block(s)
2248 * @sdp: the filesystem
2249 * @rgd: the resource group the blocks are in
2250 * @bstart: the start of a run of blocks to free
2251 * @blen: the length of the block run (all must lie within ONE RG!)
2252 * @new_state: GFS2_BLKST_XXX the after-allocation block state
/*
 * rgblk_free - set @blen blocks starting at @bstart to @new_state.
 * On the first visit to each bitmap, a clone of the on-disk bitmap is
 * made (so in-flight reads keep seeing the old state until the
 * transaction commits) and the buffer is added to the transaction.
 */
2255 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2256 		       u64 bstart, u32 blen, unsigned char new_state)
2258 	struct gfs2_rbm rbm;
2259 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2262 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
/* Crossing into a new bitmap: clone it once and journal the buffer. */
2266 		if (bi != bi_prev) {
2267 			if (!bi->bi_clone) {
/* __GFP_NOFAIL: this path cannot back out, allocation must succeed. */
2268 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2269 						       GFP_NOFS | __GFP_NOFAIL);
2270 				memcpy(bi->bi_clone + bi->bi_offset,
2271 				       bi->bi_bh->b_data + bi->bi_offset,
2274 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
/* do_clone=false: only the real bitmap changes, not the clone. */
2277 		gfs2_setbit(&rbm, false, new_state);
2278 		gfs2_rbm_add(&rbm, 1);
2283 * gfs2_rgrp_dump - print out an rgrp
2284 * @seq: The iterator
2285 * @rgd: The rgrp in question
2286 * @fs_id_buf: pointer to file system id (if requested)
/*
 * gfs2_rgrp_dump - print an rgrp's accounting, its LVB copy (when the
 * rgrplvb option is on), and every reservation in its rb-tree, to a seq
 * file or (seq == NULL) the kernel log.
 */
2290 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2291 		    const char *fs_id_buf)
2293 	struct gfs2_blkreserv *trs;
2294 	const struct rb_node *n;
2296 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n",
2298 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2299 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2300 		       rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt);
2301 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2302 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
/* Show the LVB's view so it can be compared with the line above. */
2304 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2305 			       be32_to_cpu(rgl->rl_flags),
2306 			       be32_to_cpu(rgl->rl_free),
2307 			       be32_to_cpu(rgl->rl_dinodes));
/* Walk all reservations under the spinlock. */
2309 	spin_lock(&rgd->rd_rsspin);
2310 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2311 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2312 		dump_rs(seq, trs, fs_id_buf);
2314 	spin_unlock(&rgd->rd_rsspin);
/*
 * gfs2_rgrp_error - handle inconsistent rgrp state discovered at
 * allocation time: warn, dump the rgrp for debugging, and mark it
 * GFS2_RDF_ERROR (read-only) until the filesystem is unmounted and
 * checked with fsck.gfs2.
 */
2317 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2319 	struct gfs2_sbd *sdp = rgd->rd_sbd;
/* + 7 for the "fsid=" prefix, ": " suffix, and NUL below. */
2320 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2322 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2323 		(unsigned long long)rgd->rd_addr);
2324 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2325 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2326 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2327 	rgd->rd_flags |= GFS2_RDF_ERROR;
2331 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2332 * @ip: The inode we have just allocated blocks for
2333 * @rbm: The start of the allocated blocks
2334 * @len: The extent length
2336 * Adjusts a reservation after an allocation has taken place. If the
2337 * reservation does not match the allocation, or if it is now empty
2338 * then it is removed.
/*
 * gfs2_adjust_reservation - consume @len blocks from the inode's
 * reservation after an allocation at @rbm. If the allocation starts at
 * the reservation, the reservation is shrunk from the front (and the
 * rgrp's rd_requested accounting adjusted); the reservation-removal
 * path for a mismatched/empty reservation appears elided in this copy.
 * Called with rd_rsspin held (see caller in gfs2_alloc_blocks).
 */
2341 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2342 				    const struct gfs2_rbm *rbm, unsigned len)
2344 	struct gfs2_blkreserv *rs = &ip->i_res;
2345 	struct gfs2_rgrpd *rgd = rbm->rgd;
/* The in-place reservation must cover the whole allocation. */
2347 	BUG_ON(rs->rs_reserved < len);
2348 	rs->rs_reserved -= len;
2349 	if (gfs2_rs_active(rs)) {
2350 		u64 start = gfs2_rbm_to_block(rbm);
/* Allocation begins exactly at the reservation: shrink from the front. */
2352 		if (rs->rs_start == start) {
2355 			rs->rs_start += len;
2356 			rlen = min(rs->rs_requested, len);
2357 			rs->rs_requested -= rlen;
2358 			rgd->rd_requested -= rlen;
2359 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2360 			if (rs->rs_start < rgd->rd_data0 + rgd->rd_data &&
/* We used up our block reservation, so we should
   reserve more blocks next time. */
2365 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2372 * gfs2_set_alloc_start - Set starting point for block allocation
2373 * @rbm: The rbm which will be set to the required location
2374 * @ip: The gfs2 inode
2375 * @dinode: Flag to say if allocation includes a new inode
2377 * This sets the starting point from the reservation if one is active
2378 * otherwise it falls back to guessing a start point based on the
2379 * inode's goal block or the last allocation point in the rgrp.
/*
 * gfs2_set_alloc_start - position @rbm at the best starting block for an
 * allocation: the active reservation's start if there is one, else the
 * inode's goal block (data allocations within this rgrp), else the
 * rgrp's last allocation point.
 */
2382 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2383 				 const struct gfs2_inode *ip, bool dinode)
2387 	if (gfs2_rs_active(&ip->i_res)) {
2388 		goal = ip->i_res.rs_start;
2390 		if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2393 			goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
/* Conversion should never fail for an in-range goal; recovery elided. */
2395 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2402 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2403 * @ip: the inode to allocate the block for
2404 * @bn: Used to return the starting block number
2405 * @nblocks: requested number of blocks/extent length (value/result)
2406 * @dinode: 1 if we're allocating a dinode block, else 0
2407 * @generation: the generation number of the inode
2409 * Returns: 0 or error
/*
 * gfs2_alloc_blocks - allocate *nblocks blocks (extent) for @ip, the
 * first optionally a dinode. Blocks must already be covered by an
 * in-place reservation (rs_reserved). Updates the inode goal, rgrp
 * accounting, statfs, quota, and revoke state; on inconsistent rgrp
 * counters the rgrp is marked in error via gfs2_rgrp_error().
 */
2412 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2413 		      bool dinode, u64 *generation)
2415 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2416 	struct buffer_head *dibh;
2417 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, };
2418 	u64 block; /* block, within the file system scope */
2420 	int error = -ENOSPC;
/* The in-place reservation must cover the whole request. */
2422 	BUG_ON(ip->i_res.rs_reserved < *nblocks);
/* First try within our own multi-block reservation... */
2424 	if (gfs2_rs_active(&ip->i_res)) {
2425 		gfs2_set_alloc_start(&rbm, ip, dinode);
2426 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false);
/* ...then anywhere in the rgrp, ignoring reservations. */
2428 	if (error == -ENOSPC) {
2429 		gfs2_set_alloc_start(&rbm, ip, dinode);
2430 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false);
2433 	/* Since all blocks are reserved in advance, this shouldn't happen */
2435 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2436 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2437 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2438 			rbm.rgd->rd_extfail_pt);
2442 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2443 	block = gfs2_rbm_to_block(&rbm);
2444 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
/* Record the new goal in the on-disk dinode (data allocations). */
2446 		ip->i_goal = block + *nblocks - 1;
2447 		error = gfs2_meta_inode_buffer(ip, &dibh);
2449 			struct gfs2_dinode *di =
2450 				(struct gfs2_dinode *)dibh->b_data;
2451 			gfs2_trans_add_meta(ip->i_gl, dibh);
2452 			di->di_goal_meta = di->di_goal_data =
2453 				cpu_to_be64(ip->i_goal);
2457 	spin_lock(&rbm.rgd->rd_rsspin);
2458 	gfs2_adjust_reservation(ip, &rbm, *nblocks);
/* Counters about to underflow: rgrp is corrupt, bail to error path. */
2459 	if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) {
2460 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2461 		spin_unlock(&rbm.rgd->rd_rsspin);
2464 	BUG_ON(rbm.rgd->rd_reserved < *nblocks);
2465 	BUG_ON(rbm.rgd->rd_free_clone < *nblocks);
2466 	BUG_ON(rbm.rgd->rd_free < *nblocks);
2467 	rbm.rgd->rd_reserved -= *nblocks;
2468 	rbm.rgd->rd_free_clone -= *nblocks;
2469 	rbm.rgd->rd_free -= *nblocks;
2470 	spin_unlock(&rbm.rgd->rd_rsspin);
/* Dinode allocation: bump the inode generation (0 is reserved). */
2472 		rbm.rgd->rd_dinodes++;
2473 		*generation = rbm.rgd->rd_igeneration++;
2474 		if (*generation == 0)
2475 			*generation = rbm.rgd->rd_igeneration++;
2478 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2479 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2481 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2483 	gfs2_trans_remove_revoke(sdp, block, *nblocks);
2485 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2487 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2488 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
/* Error path: mark the rgrp unusable until fsck. */
2493 	gfs2_rgrp_error(rbm.rgd);
2498 * __gfs2_free_blocks - free a contiguous run of block(s)
2499 * @ip: the inode these blocks are being freed from
2500 * @rgd: the resource group the blocks are in
2501 * @bstart: first block of a run of contiguous blocks
2502 * @blen: the length of the block run
2503 * @meta: 1 if the blocks represent metadata
/*
 * __gfs2_free_blocks - free @blen contiguous blocks at @bstart in @rgd:
 * flip their bitmap state to FREE, bump rd_free, clear the TRIMMED flag
 * (the blocks may be discarded again), journal the rgrp header, and wipe
 * any journaled copies of metadata/jdata blocks.
 */
2507 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2508 			u64 bstart, u32 blen, int meta)
2510 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2512 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2513 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2514 	rgd->rd_free += blen;
/* Newly freed blocks haven't been discarded yet. */
2515 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2516 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2517 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2519 	/* Directories keep their data in the metadata address space */
2520 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2521 		gfs2_journal_wipe(ip, bstart, blen);
2525 * gfs2_free_meta - free a contiguous run of data block(s)
2526 * @ip: the inode these blocks are being freed from
2527 * @rgd: the resource group the blocks are in
2528 * @bstart: first block of a run of contiguous blocks
2529 * @blen: the length of the block run
/*
 * gfs2_free_meta - free a run of metadata blocks: defer to
 * __gfs2_free_blocks() with meta=1, then credit statfs and the owner's
 * quota for the returned blocks.
 */
2533 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2534 		    u64 bstart, u32 blen)
2536 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2538 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2539 	gfs2_statfs_change(sdp, 0, +blen, 0);
2540 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
/*
 * gfs2_unlink_di - mark an inode's dinode block as UNLINKED (allocated
 * but no longer linked), journal the rgrp header, and bump the LVB's
 * unlinked count so other nodes know reclaimable inodes exist here.
 */
2543 void gfs2_unlink_di(struct inode *inode)
2545 	struct gfs2_inode *ip = GFS2_I(inode);
2546 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2547 	struct gfs2_rgrpd *rgd;
2548 	u64 blkno = ip->i_no_addr;
2550 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2553 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2554 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2555 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2556 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2557 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
/*
 * gfs2_free_di - free a dinode block entirely: bitmap back to FREE,
 * rgrp dinode count decremented (consistency error if already zero),
 * LVB unlinked count decremented, statfs/quota credited, and any
 * journaled copy of the block wiped.
 */
2560 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2562 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2564 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
/* Freeing a dinode from an rgrp that claims none: on-disk corruption. */
2565 	if (!rgd->rd_dinodes)
2566 		gfs2_consist_rgrpd(rgd);
2570 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2571 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2572 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
/* One more free block, one fewer dinode. */
2574 	gfs2_statfs_change(sdp, 0, +1, -1);
2575 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2576 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2577 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2581 * gfs2_check_blk_type - Check the type of a block
2582 * @sdp: The superblock
2583 * @no_addr: The block number to check
2584 * @type: The block type we are looking for
2586 * Returns: 0 if the block type matches the expected type
2587 * -ESTALE if it doesn't match
2588 * or -ve errno if something went wrong while checking
2591 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2593 struct gfs2_rgrpd *rgd;
2594 struct gfs2_holder rgd_gh;
2595 struct gfs2_rbm rbm;
2596 int error = -EINVAL;
2598 rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2602 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2607 error = gfs2_rbm_from_block(&rbm, no_addr);
2608 if (!WARN_ON_ONCE(error)) {
2609 if (gfs2_testbit(&rbm, false) != type)
2613 gfs2_glock_dq_uninit(&rgd_gh);
2620 * gfs2_rlist_add - add a RG to a list of RGs
2622 * @rlist: the list of resource groups
2625 * Figure out what RG a block belongs to and add that RG to the list
2627 * FIXME: Don't use NOFAIL
2631 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2634 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2635 struct gfs2_rgrpd *rgd;
2636 struct gfs2_rgrpd **tmp;
2637 unsigned int new_space;
2640 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2644 * The resource group last accessed is kept in the last position.
2647 if (rlist->rl_rgrps) {
2648 rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2649 if (rgrp_contains_block(rgd, block))
2651 rgd = gfs2_blk2rgrpd(sdp, block, 1);
2653 rgd = ip->i_res.rs_rgd;
2654 if (!rgd || !rgrp_contains_block(rgd, block))
2655 rgd = gfs2_blk2rgrpd(sdp, block, 1);
2659 fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2660 (unsigned long long)block);
2664 for (x = 0; x < rlist->rl_rgrps; x++) {
2665 if (rlist->rl_rgd[x] == rgd) {
2666 swap(rlist->rl_rgd[x],
2667 rlist->rl_rgd[rlist->rl_rgrps - 1]);
2672 if (rlist->rl_rgrps == rlist->rl_space) {
2673 new_space = rlist->rl_space + 10;
2675 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2676 GFP_NOFS | __GFP_NOFAIL);
2678 if (rlist->rl_rgd) {
2679 memcpy(tmp, rlist->rl_rgd,
2680 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2681 kfree(rlist->rl_rgd);
2684 rlist->rl_space = new_space;
2685 rlist->rl_rgd = tmp;
2688 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2692 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2693 * and initialize an array of glock holders for them
2694 * @rlist: the list of resource groups
2696 * FIXME: Don't use NOFAIL
2700 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2704 rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2705 sizeof(struct gfs2_holder),
2706 GFP_NOFS | __GFP_NOFAIL);
2707 for (x = 0; x < rlist->rl_rgrps; x++)
2708 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2714 * gfs2_rlist_free - free a resource group list
2715 * @rlist: the list of resource groups
2719 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
/* Release the rgrp pointer array; kfree(NULL) is a safe no-op. */
2723 kfree(rlist->rl_rgd);
/* Holders exist only if gfs2_rlist_alloc() ran; uninit each before freeing. */
2725 if (rlist->rl_ghs) {
2726 for (x = 0; x < rlist->rl_rgrps; x++)
2727 gfs2_holder_uninit(&rlist->rl_ghs[x]);
2728 kfree(rlist->rl_ghs);
/* NULL so a repeated free / reuse of the rlist is harmless. */
2729 rlist->rl_ghs = NULL;