1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_btree.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_alloc.h"
20 #include "xfs_extent_busy.h"
21 #include "xfs_errortag.h"
22 #include "xfs_error.h"
23 #include "xfs_trace.h"
24 #include "xfs_trans.h"
25 #include "xfs_buf_item.h"
27 #include "xfs_ag_resv.h"
30 extern kmem_zone_t *xfs_bmap_free_item_zone;
32 struct workqueue_struct *xfs_alloc_wq;
34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 #define XFSA_FIXUP_BNO_OK 1
37 #define XFSA_FIXUP_CNT_OK 2
39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
44 * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
45 * the beginning of the block for a proper header with the location information
/*
 * NOTE(review): body fragment of what appears to be xfs_agfl_size() — the
 * signature and braces are not visible in this sampled view; confirm against
 * the full file.  Derives how many xfs_agblock_t slots fit in one
 * sector-sized AGFL block; on CRC-enabled filesystems the struct xfs_agfl
 * header is carved out of the sector first (see the comment above, L27-28).
 */
52 unsigned int size = mp->m_sb.sb_sectsize;
54 if (xfs_sb_version_hascrc(&mp->m_sb))
55 size -= sizeof(struct xfs_agfl);
57 return size / sizeof(xfs_agblock_t);
/*
 * NOTE(review): body fragment (function name line missing from this view).
 * Returns the AG block immediately after the highest-numbered btree root
 * present: rmapbt root if the rmapbt feature is enabled, else the finobt
 * root, else the inobt root.
 */
64 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
65 return XFS_RMAP_BLOCK(mp) + 1;
66 if (xfs_sb_version_hasfinobt(&mp->m_sb))
67 return XFS_FIBT_BLOCK(mp) + 1;
68 return XFS_IBT_BLOCK(mp) + 1;
/*
 * NOTE(review): body fragment (function name line missing from this view).
 * Same cascade as the helper above, but additionally considers the
 * refcount btree root when the reflink feature is enabled — i.e. it yields
 * the first block past all statically-placed AG btree roots.
 */
75 if (xfs_sb_version_hasreflink(&mp->m_sb))
76 return xfs_refc_block(mp) + 1;
77 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
78 return XFS_RMAP_BLOCK(mp) + 1;
79 if (xfs_sb_version_hasfinobt(&mp->m_sb))
80 return XFS_FIBT_BLOCK(mp) + 1;
81 return XFS_IBT_BLOCK(mp) + 1;
85 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
86 * AGF buffer (PV 947395), we place constraints on the relationship among
87 * actual allocations for data blocks, freelist blocks, and potential file data
88 * bmap btree blocks. However, these restrictions may result in no actual space
89 * allocated for a delayed extent, for example, a data block in a certain AG is
90 * allocated but there is no additional block for the additional bmap btree
91 * block due to a split of the bmap btree of the file. The result of this may
92 * lead to an infinite loop when the file gets flushed to disk and all delayed
93 * extents need to be actually allocated. To get around this, we explicitly set
94 * aside a few blocks which will not be reserved in delayed allocation.
96 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
97 * potential split of the file's bmap btree.
101 struct xfs_mount *mp)
103 return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
107 * When deciding how much space to allocate out of an AG, we limit the
108 * allocation maximum size to the size of the AG. However, we cannot use all the
109 * blocks in the AG - some are permanently used by metadata. These
110 * blocks are generally:
111 * - the AG superblock, AGF, AGI and AGFL
112 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
113 * the AGI free inode and rmap btree root blocks.
114 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
115 * - the rmapbt root block
117 * The AG headers are sector sized, so the amount of space they take up is
118 * dependent on filesystem geometry. The others are all single blocks.
121 xfs_alloc_ag_max_usable(
122 struct xfs_mount *mp)
126 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
127 blocks += XFS_ALLOC_AGFL_RESERVE;
128 blocks += 3; /* AGF, AGI btree root blocks */
129 if (xfs_sb_version_hasfinobt(&mp->m_sb))
130 blocks++; /* finobt root block */
131 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
132 blocks++; /* rmap root block */
133 if (xfs_sb_version_hasreflink(&mp->m_sb))
134 blocks++; /* refcount root block */
136 return mp->m_sb.sb_agblocks - blocks;
140 * Lookup the record equal to [bno, len] in the btree given by cur.
142 STATIC int /* error */
144 struct xfs_btree_cur *cur, /* btree cursor */
145 xfs_agblock_t bno, /* starting block of extent */
146 xfs_extlen_t len, /* length of extent */
147 int *stat) /* success/failure */
/*
 * NOTE(review): the function-name line is missing from this view; per the
 * comment above this is the EQ lookup variant.  The search key is loaded
 * into the cursor's record area, then the generic btree lookup runs with
 * XFS_LOOKUP_EQ.  The cursor is flagged active only when an exact match
 * was found (*stat == 1).
 */
151 cur->bc_rec.a.ar_startblock = bno;
152 cur->bc_rec.a.ar_blockcount = len;
153 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
154 cur->bc_ag.abt.active = (*stat == 1);
159 * Lookup the first record greater than or equal to [bno, len]
160 * in the btree given by cur.
164 struct xfs_btree_cur *cur, /* btree cursor */
165 xfs_agblock_t bno, /* starting block of extent */
166 xfs_extlen_t len, /* length of extent */
167 int *stat) /* success/failure */
171 cur->bc_rec.a.ar_startblock = bno;
172 cur->bc_rec.a.ar_blockcount = len;
173 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
174 cur->bc_ag.abt.active = (*stat == 1);
179 * Lookup the first record less than or equal to [bno, len]
180 * in the btree given by cur.
184 struct xfs_btree_cur *cur, /* btree cursor */
185 xfs_agblock_t bno, /* starting block of extent */
186 xfs_extlen_t len, /* length of extent */
187 int *stat) /* success/failure */
190 cur->bc_rec.a.ar_startblock = bno;
191 cur->bc_rec.a.ar_blockcount = len;
192 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
193 cur->bc_ag.abt.active = (*stat == 1);
/*
 * NOTE(review): return-type/brace lines are missing from this view.
 * True only for a non-NULL cursor whose allocation search is still active
 * (bc_ag.abt.active is cleared when a cursor goes out of range).
 */
198 xfs_alloc_cur_active(
199 struct xfs_btree_cur *cur)
201 return cur && cur->bc_ag.abt.active;
205 * Update the record referred to by cur to the value given
207 * This either works (return 0) or gets an EFSCORRUPTED error.
209 STATIC int /* error */
211 struct xfs_btree_cur *cur, /* btree cursor */
212 xfs_agblock_t bno, /* starting block of extent */
213 xfs_extlen_t len) /* length of extent */
/*
 * NOTE(review): the function-name line is missing from this sampled view.
 * The on-disk free-space record is big-endian, hence the cpu_to_be32
 * conversions before handing the record to the generic btree update.
 */
215 union xfs_btree_rec rec;
217 rec.alloc.ar_startblock = cpu_to_be32(bno);
218 rec.alloc.ar_blockcount = cpu_to_be32(len);
219 return xfs_btree_update(cur, &rec);
223 * Get the data from the pointed-to record.
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t *bno, /* output: starting block of extent */
229 xfs_extlen_t *len, /* output: length of extent */
230 int *stat) /* output: success/failure */
232 struct xfs_mount *mp = cur->bc_mp;
233 xfs_agnumber_t agno = cur->bc_ag.agno;
234 union xfs_btree_rec *rec;
237 error = xfs_btree_get_rec(cur, &rec, stat);
238 if (error || !(*stat))
241 *bno = be32_to_cpu(rec->alloc.ar_startblock);
242 *len = be32_to_cpu(rec->alloc.ar_blockcount);
247 /* check for valid extent range, including overflow */
248 if (!xfs_verify_agbno(mp, agno, *bno))
250 if (*bno > *bno + *len)
252 if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
259 "%s Freespace BTree record corruption in AG %d detected!",
260 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
262 "start block 0x%x block count 0x%x", *bno, *len);
263 return -EFSCORRUPTED;
267 * Compute aligned version of the found extent.
268 * Takes alignment and min length into account.
271 xfs_alloc_compute_aligned(
272 xfs_alloc_arg_t *args, /* allocation argument structure */
273 xfs_agblock_t foundbno, /* starting block in found extent */
274 xfs_extlen_t foundlen, /* length in found extent */
275 xfs_agblock_t *resbno, /* result block number */
276 xfs_extlen_t *reslen, /* result length */
279 xfs_agblock_t bno = foundbno;
280 xfs_extlen_t len = foundlen;
284 /* Trim busy sections out of found extent */
285 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
288 * If we have a largish extent that happens to start before min_agbno,
289 * see if we can shift it into range...
291 if (bno < args->min_agbno && bno + len > args->min_agbno) {
292 diff = args->min_agbno - bno;
299 if (args->alignment > 1 && len >= args->minlen) {
300 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
302 diff = aligned_bno - bno;
304 *resbno = aligned_bno;
305 *reslen = diff >= len ? 0 : len - diff;
315 * Compute best start block and diff for "near" allocations.
316 * freelen >= wantlen already checked by caller.
318 STATIC xfs_extlen_t /* difference value (absolute) */
319 xfs_alloc_compute_diff(
320 xfs_agblock_t wantbno, /* target starting block */
321 xfs_extlen_t wantlen, /* target length */
322 xfs_extlen_t alignment, /* target alignment */
323 int datatype, /* are we allocating data? */
324 xfs_agblock_t freebno, /* freespace's starting block */
325 xfs_extlen_t freelen, /* freespace's length */
326 xfs_agblock_t *newbnop) /* result: best start block from free */
328 xfs_agblock_t freeend; /* end of freespace extent */
329 xfs_agblock_t newbno1; /* return block number */
330 xfs_agblock_t newbno2; /* other new block number */
331 xfs_extlen_t newlen1=0; /* length with newbno1 */
332 xfs_extlen_t newlen2=0; /* length with newbno2 */
333 xfs_agblock_t wantend; /* end of target extent */
334 bool userdata = datatype & XFS_ALLOC_USERDATA;
336 ASSERT(freelen >= wantlen);
337 freeend = freebno + freelen;
338 wantend = wantbno + wantlen;
340 * We want to allocate from the start of a free extent if it is past
341 * the desired block or if we are allocating user data and the free
342 * extent is before desired block. The second case is there to allow
343 * for contiguous allocation from the remaining free space if the file
344 * grows in the short term.
346 if (freebno >= wantbno || (userdata && freeend < wantend)) {
347 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
348 newbno1 = NULLAGBLOCK;
349 } else if (freeend >= wantend && alignment > 1) {
350 newbno1 = roundup(wantbno, alignment);
351 newbno2 = newbno1 - alignment;
352 if (newbno1 >= freeend)
353 newbno1 = NULLAGBLOCK;
355 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
356 if (newbno2 < freebno)
357 newbno2 = NULLAGBLOCK;
359 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
360 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
361 if (newlen1 < newlen2 ||
362 (newlen1 == newlen2 &&
363 XFS_ABSDIFF(newbno1, wantbno) >
364 XFS_ABSDIFF(newbno2, wantbno)))
366 } else if (newbno2 != NULLAGBLOCK)
368 } else if (freeend >= wantend) {
370 } else if (alignment > 1) {
371 newbno1 = roundup(freeend - wantlen, alignment);
372 if (newbno1 > freeend - wantlen &&
373 newbno1 - alignment >= freebno)
374 newbno1 -= alignment;
375 else if (newbno1 >= freeend)
376 newbno1 = NULLAGBLOCK;
378 newbno1 = freeend - wantlen;
380 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
384 * Fix up the length, based on mod and prod.
385 * len should be k * prod + mod for some k.
386 * If len is too small it is returned unchanged.
387 * If len hits maxlen it is left alone.
391 xfs_alloc_arg_t *args) /* allocation argument structure */
396 ASSERT(args->mod < args->prod);
398 ASSERT(rlen >= args->minlen);
399 ASSERT(rlen <= args->maxlen);
400 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
401 (args->mod == 0 && rlen < args->prod))
403 k = rlen % args->prod;
407 rlen = rlen - (k - args->mod);
409 rlen = rlen - args->prod + (args->mod - k);
410 /* casts to (int) catch length underflows */
411 if ((int)rlen < (int)args->minlen)
413 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
414 ASSERT(rlen % args->prod == args->mod);
415 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
416 rlen + args->minleft);
421 * Update the two btrees, logically removing from freespace the extent
422 * starting at rbno, rlen blocks. The extent is contained within the
423 * actual (current) free extent fbno for flen blocks.
424 * Flags are passed in indicating whether the cursors are set to the
427 STATIC int /* error code */
428 xfs_alloc_fixup_trees(
429 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
430 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
431 xfs_agblock_t fbno, /* starting block of free extent */
432 xfs_extlen_t flen, /* length of free extent */
433 xfs_agblock_t rbno, /* starting block of returned extent */
434 xfs_extlen_t rlen, /* length of returned extent */
435 int flags) /* flags, XFSA_FIXUP_... */
437 int error; /* error code */
438 int i; /* operation results */
439 xfs_agblock_t nfbno1; /* first new free startblock */
440 xfs_agblock_t nfbno2; /* second new free startblock */
441 xfs_extlen_t nflen1=0; /* first new free length */
442 xfs_extlen_t nflen2=0; /* second new free length */
443 struct xfs_mount *mp;
448 * Look up the record in the by-size tree if necessary.
450 if (flags & XFSA_FIXUP_CNT_OK) {
452 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
454 if (XFS_IS_CORRUPT(mp,
458 return -EFSCORRUPTED;
461 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
463 if (XFS_IS_CORRUPT(mp, i != 1))
464 return -EFSCORRUPTED;
467 * Look up the record in the by-block tree if necessary.
469 if (flags & XFSA_FIXUP_BNO_OK) {
471 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
473 if (XFS_IS_CORRUPT(mp,
477 return -EFSCORRUPTED;
480 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
482 if (XFS_IS_CORRUPT(mp, i != 1))
483 return -EFSCORRUPTED;
487 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
488 struct xfs_btree_block *bnoblock;
489 struct xfs_btree_block *cntblock;
491 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
492 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
494 if (XFS_IS_CORRUPT(mp,
495 bnoblock->bb_numrecs !=
496 cntblock->bb_numrecs))
497 return -EFSCORRUPTED;
502 * Deal with all four cases: the allocated record is contained
503 * within the freespace record, so we can have new freespace
504 * at either (or both) end, or no freespace remaining.
506 if (rbno == fbno && rlen == flen)
507 nfbno1 = nfbno2 = NULLAGBLOCK;
508 else if (rbno == fbno) {
509 nfbno1 = rbno + rlen;
510 nflen1 = flen - rlen;
511 nfbno2 = NULLAGBLOCK;
512 } else if (rbno + rlen == fbno + flen) {
514 nflen1 = flen - rlen;
515 nfbno2 = NULLAGBLOCK;
518 nflen1 = rbno - fbno;
519 nfbno2 = rbno + rlen;
520 nflen2 = (fbno + flen) - nfbno2;
523 * Delete the entry from the by-size btree.
525 if ((error = xfs_btree_delete(cnt_cur, &i)))
527 if (XFS_IS_CORRUPT(mp, i != 1))
528 return -EFSCORRUPTED;
530 * Add new by-size btree entry(s).
532 if (nfbno1 != NULLAGBLOCK) {
533 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
535 if (XFS_IS_CORRUPT(mp, i != 0))
536 return -EFSCORRUPTED;
537 if ((error = xfs_btree_insert(cnt_cur, &i)))
539 if (XFS_IS_CORRUPT(mp, i != 1))
540 return -EFSCORRUPTED;
542 if (nfbno2 != NULLAGBLOCK) {
543 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
545 if (XFS_IS_CORRUPT(mp, i != 0))
546 return -EFSCORRUPTED;
547 if ((error = xfs_btree_insert(cnt_cur, &i)))
549 if (XFS_IS_CORRUPT(mp, i != 1))
550 return -EFSCORRUPTED;
553 * Fix up the by-block btree entry(s).
555 if (nfbno1 == NULLAGBLOCK) {
557 * No remaining freespace, just delete the by-block tree entry.
559 if ((error = xfs_btree_delete(bno_cur, &i)))
561 if (XFS_IS_CORRUPT(mp, i != 1))
562 return -EFSCORRUPTED;
565 * Update the by-block entry to start later|be shorter.
567 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
570 if (nfbno2 != NULLAGBLOCK) {
572 * 2 resulting free entries, need to add one.
574 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
576 if (XFS_IS_CORRUPT(mp, i != 0))
577 return -EFSCORRUPTED;
578 if ((error = xfs_btree_insert(bno_cur, &i)))
580 if (XFS_IS_CORRUPT(mp, i != 1))
581 return -EFSCORRUPTED;
586 static xfs_failaddr_t
590 struct xfs_mount *mp = bp->b_mount;
591 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
592 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
596 * There is no verification of non-crc AGFLs because mkfs does not
597 * initialise the AGFL to zero or NULL. Hence the only valid part of the
598 * AGFL is what the AGF says is active. We can't get to the AGF, so we
599 * can't verify just those entries are valid.
601 if (!xfs_sb_version_hascrc(&mp->m_sb))
604 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
605 return __this_address;
606 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
607 return __this_address;
609 * during growfs operations, the perag is not fully initialised,
610 * so we can't use it for any useful checking. growfs ensures we can't
611 * use it by using uncached buffers that don't have the perag attached
612 * so we can detect and avoid this problem.
614 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
615 return __this_address;
617 for (i = 0; i < xfs_agfl_size(mp); i++) {
618 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
619 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
620 return __this_address;
623 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
624 return __this_address;
629 xfs_agfl_read_verify(
632 struct xfs_mount *mp = bp->b_mount;
636 * There is no verification of non-crc AGFLs because mkfs does not
637 * initialise the AGFL to zero or NULL. Hence the only valid part of the
638 * AGFL is what the AGF says is active. We can't get to the AGF, so we
639 * can't verify just those entries are valid.
641 if (!xfs_sb_version_hascrc(&mp->m_sb))
644 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
645 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
647 fa = xfs_agfl_verify(bp);
649 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
654 xfs_agfl_write_verify(
657 struct xfs_mount *mp = bp->b_mount;
658 struct xfs_buf_log_item *bip = bp->b_log_item;
661 /* no verification of non-crc AGFLs */
662 if (!xfs_sb_version_hascrc(&mp->m_sb))
665 fa = xfs_agfl_verify(bp);
667 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
672 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
674 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
677 const struct xfs_buf_ops xfs_agfl_buf_ops = {
679 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
680 .verify_read = xfs_agfl_read_verify,
681 .verify_write = xfs_agfl_write_verify,
682 .verify_struct = xfs_agfl_verify,
686 * Read in the allocation group free block array.
690 xfs_mount_t *mp, /* mount point structure */
691 xfs_trans_t *tp, /* transaction pointer */
692 xfs_agnumber_t agno, /* allocation group number */
693 struct xfs_buf **bpp) /* buffer for the ag free block array */
695 struct xfs_buf *bp; /* return value */
698 ASSERT(agno != NULLAGNUMBER);
699 error = xfs_trans_read_buf(
700 mp, tp, mp->m_ddev_targp,
701 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
702 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
705 xfs_buf_set_ref(bp, XFS_AGFL_REF);
/*
 * NOTE(review): fragment — the 'len' parameter declaration and some brace
 * lines are missing from this view; presumably len is a signed block delta
 * (negative on allocation, positive on free) — confirm against the full
 * file.  Updates the in-core perag free-block count and the on-disk AGF
 * counter in step, then sanity-checks that free blocks never exceed the AG
 * length, marking the AGF buffer corrupt if they do.
 */
711 xfs_alloc_update_counters(
712 struct xfs_trans *tp,
713 struct xfs_buf *agbp,
716 struct xfs_agf *agf = agbp->b_addr;
718 agbp->b_pag->pagf_freeblks += len;
719 be32_add_cpu(&agf->agf_freeblks, len);
721 xfs_trans_agblocks_delta(tp, len);
722 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
723 be32_to_cpu(agf->agf_length))) {
724 xfs_buf_mark_corrupt(agbp);
725 return -EFSCORRUPTED;
728 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
733 * Block allocation algorithm and data structures.
735 struct xfs_alloc_cur {
736 struct xfs_btree_cur *cnt; /* btree cursors */
737 struct xfs_btree_cur *bnolt;
738 struct xfs_btree_cur *bnogt;
739 xfs_extlen_t cur_len;/* current search length */
740 xfs_agblock_t rec_bno;/* extent startblock */
741 xfs_extlen_t rec_len;/* extent length */
742 xfs_agblock_t bno; /* alloc bno */
743 xfs_extlen_t len; /* alloc len */
744 xfs_extlen_t diff; /* diff from search bno */
745 unsigned int busy_gen;/* busy state */
750 * Set up cursors, etc. in the extent allocation cursor. This function can be
751 * called multiple times to reset an initialized structure without having to
752 * reallocate cursors.
756 struct xfs_alloc_arg *args,
757 struct xfs_alloc_cur *acur)
762 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
764 acur->cur_len = args->maxlen;
774 * Perform an initial cntbt lookup to check for availability of maxlen
775 * extents. If this fails, we'll return -ENOSPC to signal the caller to
776 * attempt a small allocation.
779 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
780 args->agbp, args->agno, XFS_BTNUM_CNT);
781 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
786 * Allocate the bnobt left and right search cursors.
789 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
790 args->agbp, args->agno, XFS_BTNUM_BNO);
792 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
793 args->agbp, args->agno, XFS_BTNUM_BNO);
794 return i == 1 ? 0 : -ENOSPC;
799 struct xfs_alloc_cur *acur,
802 int cur_error = XFS_BTREE_NOERROR;
805 cur_error = XFS_BTREE_ERROR;
808 xfs_btree_del_cursor(acur->cnt, cur_error);
810 xfs_btree_del_cursor(acur->bnolt, cur_error);
812 xfs_btree_del_cursor(acur->bnogt, cur_error);
813 acur->cnt = acur->bnolt = acur->bnogt = NULL;
817 * Check an extent for allocation and track the best available candidate in the
818 * allocation structure. The cursor is deactivated if it has entered an out of
819 * range state based on allocation arguments. Optionally return the
820 * extent geometry and allocation status if requested by the caller.
824 struct xfs_alloc_arg *args,
825 struct xfs_alloc_cur *acur,
826 struct xfs_btree_cur *cur,
830 xfs_agblock_t bno, bnoa, bnew;
831 xfs_extlen_t len, lena, diff = -1;
833 unsigned busy_gen = 0;
834 bool deactivate = false;
835 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
839 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
842 if (XFS_IS_CORRUPT(args->mp, i != 1))
843 return -EFSCORRUPTED;
846 * Check minlen and deactivate a cntbt cursor if out of acceptable size
847 * range (i.e., walking backwards looking for a minlen extent).
849 if (len < args->minlen) {
850 deactivate = !isbnobt;
854 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
858 acur->busy_gen = busy_gen;
859 /* deactivate a bnobt cursor outside of locality range */
860 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
861 deactivate = isbnobt;
864 if (lena < args->minlen)
867 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
868 xfs_alloc_fix_len(args);
869 ASSERT(args->len >= args->minlen);
870 if (args->len < acur->len)
874 * We have an aligned record that satisfies minlen and beats or matches
875 * the candidate extent size. Compare locality for near allocation mode.
877 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
878 diff = xfs_alloc_compute_diff(args->agbno, args->len,
879 args->alignment, args->datatype,
881 if (bnew == NULLAGBLOCK)
885 * Deactivate a bnobt cursor with worse locality than the current best.
887 if (diff > acur->diff) {
888 deactivate = isbnobt;
892 ASSERT(args->len > acur->len ||
893 (args->len == acur->len && diff <= acur->diff));
897 acur->len = args->len;
902 * We're done if we found a perfect allocation. This only deactivates
903 * the current cursor, but this is just an optimization to terminate a
904 * cntbt search that otherwise runs to the edge of the tree.
906 if (acur->diff == 0 && acur->len == args->maxlen)
910 cur->bc_ag.abt.active = false;
911 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
917 * Complete an allocation of a candidate extent. Remove the extent from both
918 * trees and update the args structure.
921 xfs_alloc_cur_finish(
922 struct xfs_alloc_arg *args,
923 struct xfs_alloc_cur *acur)
925 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
928 ASSERT(acur->cnt && acur->bnolt);
929 ASSERT(acur->bno >= acur->rec_bno);
930 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
931 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
933 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
934 acur->rec_len, acur->bno, acur->len, 0);
938 args->agbno = acur->bno;
939 args->len = acur->len;
942 trace_xfs_alloc_cur(args);
947 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
948 * bno optimized lookup to search for extents with ideal size and locality.
951 xfs_alloc_cntbt_iter(
952 struct xfs_alloc_arg *args,
953 struct xfs_alloc_cur *acur)
955 struct xfs_btree_cur *cur = acur->cnt;
957 xfs_extlen_t len, cur_len;
961 if (!xfs_alloc_cur_active(cur))
964 /* locality optimized lookup */
965 cur_len = acur->cur_len;
966 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
971 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
975 /* check the current record and update search length from it */
976 error = xfs_alloc_cur_check(args, acur, cur, &i);
979 ASSERT(len >= acur->cur_len);
983 * We looked up the first record >= [agbno, len] above. The agbno is a
984 * secondary key and so the current record may lie just before or after
985 * agbno. If it is past agbno, check the previous record too so long as
986 * the length matches as it may be closer. Don't check a smaller record
987 * because that could deactivate our cursor.
989 if (bno > args->agbno) {
990 error = xfs_btree_decrement(cur, 0, &i);
992 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
993 if (!error && i && len == acur->cur_len)
994 error = xfs_alloc_cur_check(args, acur, cur,
1002 * Increment the search key until we find at least one allocation
1003 * candidate or if the extent we found was larger. Otherwise, double the
1004 * search key to optimize the search. Efficiency is more important here
1005 * than absolute best locality.
1008 if (!acur->len || acur->cur_len >= cur_len)
1011 acur->cur_len = cur_len;
1017 * Deal with the case where only small freespaces remain. Either return the
1018 * contents of the last freespace record, or allocate space from the freelist if
1019 * there is nothing in the tree.
1021 STATIC int /* error */
1022 xfs_alloc_ag_vextent_small(
1023 struct xfs_alloc_arg *args, /* allocation argument structure */
1024 struct xfs_btree_cur *ccur, /* optional by-size cursor */
1025 xfs_agblock_t *fbnop, /* result block number */
1026 xfs_extlen_t *flenp, /* result length */
1027 int *stat) /* status: 0-freelist, 1-normal/none */
1029 struct xfs_agf *agf = args->agbp->b_addr;
1031 xfs_agblock_t fbno = NULLAGBLOCK;
1032 xfs_extlen_t flen = 0;
1036 * If a cntbt cursor is provided, try to allocate the largest record in
1037 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1038 * allocation. Make sure to respect minleft even when pulling from the
1042 error = xfs_btree_decrement(ccur, 0, &i);
1046 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1049 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1050 error = -EFSCORRUPTED;
1056 if (args->minlen != 1 || args->alignment != 1 ||
1057 args->resv == XFS_AG_RESV_AGFL ||
1058 be32_to_cpu(agf->agf_flcount) <= args->minleft)
1061 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1064 if (fbno == NULLAGBLOCK)
1067 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1068 (args->datatype & XFS_ALLOC_NOBUSY));
1070 if (args->datatype & XFS_ALLOC_USERDATA) {
1073 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1074 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1075 args->mp->m_bsize, 0, &bp);
1078 xfs_trans_binval(args->tp, bp);
1080 *fbnop = args->agbno = fbno;
1081 *flenp = args->len = 1;
1082 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1083 error = -EFSCORRUPTED;
1086 args->wasfromfl = 1;
1087 trace_xfs_alloc_small_freelist(args);
1090 * If we're feeding an AGFL block to something that doesn't live in the
1091 * free space, we need to clear out the OWN_AG rmap.
1093 error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
1094 &XFS_RMAP_OINFO_AG);
1103 * Can't do the allocation, give up.
1105 if (flen < args->minlen) {
1106 args->agbno = NULLAGBLOCK;
1107 trace_xfs_alloc_small_notenough(args);
1113 trace_xfs_alloc_small_done(args);
1117 trace_xfs_alloc_small_error(args);
1122 * Allocate a variable extent in the allocation group agno.
1123 * Type and bno are used to determine where in the allocation group the
1124 * extent will start.
1125 * Extent's length (returned in *len) will be between minlen and maxlen,
1126 * and of the form k * prod + mod unless there's nothing that large.
1127 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1129 STATIC int /* error */
1130 xfs_alloc_ag_vextent(
1131 xfs_alloc_arg_t *args) /* argument structure for allocation */
1135 ASSERT(args->minlen > 0);
1136 ASSERT(args->maxlen > 0);
1137 ASSERT(args->minlen <= args->maxlen);
1138 ASSERT(args->mod < args->prod);
1139 ASSERT(args->alignment > 0);
1142 * Branch to correct routine based on the type.
1144 args->wasfromfl = 0;
1145 switch (args->type) {
1146 case XFS_ALLOCTYPE_THIS_AG:
1147 error = xfs_alloc_ag_vextent_size(args);
1149 case XFS_ALLOCTYPE_NEAR_BNO:
1150 error = xfs_alloc_ag_vextent_near(args);
1152 case XFS_ALLOCTYPE_THIS_BNO:
1153 error = xfs_alloc_ag_vextent_exact(args);
1160 if (error || args->agbno == NULLAGBLOCK)
1163 ASSERT(args->len >= args->minlen);
1164 ASSERT(args->len <= args->maxlen);
1165 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1166 ASSERT(args->agbno % args->alignment == 0);
1168 /* if not file data, insert new block into the reverse map btree */
1169 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1170 error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
1171 args->agbno, args->len, &args->oinfo);
1176 if (!args->wasfromfl) {
1177 error = xfs_alloc_update_counters(args->tp, args->agbp,
1178 -((long)(args->len)));
1182 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
1183 args->agbno, args->len));
1186 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1188 XFS_STATS_INC(args->mp, xs_allocx);
1189 XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1194 * Allocate a variable extent at exactly agno/bno.
1195 * Extent's length (returned in *len) will be between minlen and maxlen,
1196 * and of the form k * prod + mod unless there's nothing that large.
1197 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1199 STATIC int /* error */
1200 xfs_alloc_ag_vextent_exact(
1201 xfs_alloc_arg_t *args) /* allocation argument structure */
1203 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1204 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
1205 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
1207 xfs_agblock_t fbno; /* start block of found extent */
1208 xfs_extlen_t flen; /* length of found extent */
1209 xfs_agblock_t tbno; /* start block of busy extent */
1210 xfs_extlen_t tlen; /* length of busy extent */
1211 xfs_agblock_t tend; /* end block of busy extent */
1212 int i; /* success/failure of operation */
1215 ASSERT(args->alignment == 1);
1218 * Allocate/initialize a cursor for the by-number freespace btree.
1220 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1221 args->agno, XFS_BTNUM_BNO);
1224 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1225 * Look for the closest free block <= bno, it must contain bno
1226 * if any free block does.
1228 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1235 * Grab the freespace record.
1237 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1240 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1241 error = -EFSCORRUPTED;
1244 ASSERT(fbno <= args->agbno);
1247 * Check for overlapping busy extents.
1251 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1254 * Give up if the start of the extent is busy, or the freespace isn't
1255 * long enough for the minimum request.
1257 if (tbno > args->agbno)
1259 if (tlen < args->minlen)
1262 if (tend < args->agbno + args->minlen)
1266 * End of extent will be smaller of the freespace end and the
1267 * maximal requested end.
1269 * Fix the length according to mod and prod if given.
1271 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1273 xfs_alloc_fix_len(args);
1274 ASSERT(args->agbno + args->len <= tend);
1277 * We are allocating agbno for args->len
1278 * Allocate/initialize a cursor for the by-size btree.
1280 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1281 args->agno, XFS_BTNUM_CNT);
1282 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1283 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1284 args->len, XFSA_FIXUP_BNO_OK);
1286 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1290 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1291 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1293 args->wasfromfl = 0;
1294 trace_xfs_alloc_exact_done(args);
1298 /* Didn't find it, return null. */
1299 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1300 args->agbno = NULLAGBLOCK;
1301 trace_xfs_alloc_exact_notfound(args);
1305 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1306 trace_xfs_alloc_exact_error(args);
/*
 * NOTE(review): this extract is missing interleaved source lines (the
 * embedded line numbers jump); the comments below describe only the
 * visible code — confirm control flow against the full source.
 */
1311 * Search a given number of btree records in a given direction. Check each
1312 * record against the good extent we've already found.
1315 xfs_alloc_walk_iter(
1316 struct xfs_alloc_arg *args,
1317 struct xfs_alloc_cur *acur,
1318 struct xfs_btree_cur *cur,
1320 bool find_one, /* quit on first candidate */
1321 int count, /* rec count (-1 for infinite) */
1330 * Search so long as the cursor is active or we find a better extent.
1331 * The cursor is deactivated if it extends beyond the range of the
1332 * current allocation candidate.
/* count == -1 never reaches zero here, giving an unbounded walk. */
1334 while (xfs_alloc_cur_active(cur) && count) {
/* Evaluate the record under the cursor as an allocation candidate. */
1335 error = xfs_alloc_cur_check(args, acur, cur, &i);
1343 if (!xfs_alloc_cur_active(cur))
/* Step the cursor in the requested direction (increment vs decrement). */
1347 error = xfs_btree_increment(cur, 0, &i);
1349 error = xfs_btree_decrement(cur, 0, &i);
/* Walked off the end of the tree: deactivate this cursor. */
1353 cur->bc_ag.abt.active = false;
/*
 * NOTE(review): extract is missing interleaved source lines (error
 * checks, closing braces, the final best-candidate walk); comments
 * below describe only the visible code.
 */
1363 * Search the by-bno and by-size btrees in parallel in search of an extent with
1364 * ideal locality based on the NEAR mode ->agbno locality hint.
1367 xfs_alloc_ag_vextent_locality(
1368 struct xfs_alloc_arg *args,
1369 struct xfs_alloc_cur *acur,
1372 struct xfs_btree_cur *fbcur = NULL;
1377 ASSERT(acur->len == 0);
1378 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
/* Position all three cursors around the agbno locality hint. */
1382 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1385 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1388 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1393 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1394 * right and lookup the closest extent to the locality hint for each
1395 * extent size key in the cntbt. The entire search terminates
1396 * immediately on a bnobt hit because that means we've found best case
1397 * locality. Otherwise the search continues until the cntbt cursor runs
1398 * off the end of the tree. If no allocation candidate is found at this
1399 * point, give up on locality, walk backwards from the end of the cntbt
1400 * and take the first available extent.
1402 * The parallel tree searches balance each other out to provide fairly
1403 * consistent performance for various situations. The bnobt search can
1404 * have pathological behavior in the worst case scenario of larger
1405 * allocation requests and fragmented free space. On the other hand, the
1406 * bnobt is able to satisfy most smaller allocation requests much more
1407 * quickly than the cntbt. The cntbt search can sift through fragmented
1408 * free space and sets of free extents for larger allocation requests
1409 * more quickly than the bnobt. Since the locality hint is just a hint
1410 * and we don't want to scan the entire bnobt for perfect locality, the
1411 * cntbt search essentially bounds the bnobt search such that we can
1412 * find good enough locality at reasonable performance in most cases.
1414 while (xfs_alloc_cur_active(acur->bnolt) ||
1415 xfs_alloc_cur_active(acur->bnogt) ||
1416 xfs_alloc_cur_active(acur->cnt)) {
1418 trace_xfs_alloc_cur_lookup(args);
1421 * Search the bnobt left and right. In the case of a hit, finish
1422 * the search in the opposite direction and we're done.
1424 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
/* Left-side hit: finish by scanning rightwards from the hint. */
1429 trace_xfs_alloc_cur_left(args);
1430 fbcur = acur->bnogt;
1434 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
/* Right-side hit: finish by scanning leftwards from the hint. */
1439 trace_xfs_alloc_cur_right(args);
1440 fbcur = acur->bnolt;
1446 * Check the extent with best locality based on the current
1447 * extent size search key and keep track of the best candidate.
1449 error = xfs_alloc_cntbt_iter(args, acur);
1452 if (!xfs_alloc_cur_active(acur->cnt)) {
1453 trace_xfs_alloc_cur_lookup_done(args);
1459 * If we failed to find anything due to busy extents, return empty
1460 * handed so the caller can flush and retry. If no busy extents were
1461 * found, walk backwards from the end of the cntbt as a last resort.
1463 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1464 error = xfs_btree_decrement(acur->cnt, 0, &i);
/* Re-arm the cntbt cursor for the backwards last-resort walk. */
1468 acur->cnt->bc_ag.abt.active = true;
1475 * Search in the opposite direction for a better entry in the case of
1476 * a bnobt hit or walk backwards from the end of the cntbt.
1479 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
/*
 * NOTE(review): extract is missing interleaved source lines; comments
 * below describe only the visible code.
 */
1491 /* Check the last block of the cnt btree for allocations. */
1493 xfs_alloc_ag_vextent_lastblock(
1494 struct xfs_alloc_arg *args,
1495 struct xfs_alloc_cur *acur,
/*
 * NOTE(review): the prandom_u32() skip looks like randomized test
 * coverage of the fallback path — in upstream xfs_alloc.c this is
 * compiled under DEBUG only; confirm the missing #ifdef here.
 */
1504 /* Randomly don't execute the first algorithm. */
1505 if (prandom_u32() & 1)
1510 * Start from the entry that lookup found, sequence through all larger
1511 * free blocks. If we're actually pointing at a record smaller than
1512 * maxlen, go to the start of this block, and skip all those smaller
1515 if (*len || args->alignment > 1) {
/* Rewind to the first record of the current (last) leaf block. */
1516 acur->cnt->bc_ptrs[0] = 1;
1518 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1521 if (XFS_IS_CORRUPT(args->mp, i != 1))
1522 return -EFSCORRUPTED;
1523 if (*len >= args->minlen)
1525 error = xfs_btree_increment(acur->cnt, 0, &i);
1529 ASSERT(*len >= args->minlen);
/* Walk the remaining records of the block looking for the best fit. */
1534 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1539 * It didn't work. We COULD be in a case where there's a good record
1540 * somewhere, so try again.
1545 trace_xfs_alloc_near_first(args);
/*
 * NOTE(review): extract is missing interleaved source lines (error
 * checks, goto labels, closing braces); comments below describe only
 * the visible code.
 */
1551 * Allocate a variable extent near bno in the allocation group agno.
1552 * Extent's length (returned in len) will be between minlen and maxlen,
1553 * and of the form k * prod + mod unless there's nothing that large.
1554 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1557 xfs_alloc_ag_vextent_near(
1558 struct xfs_alloc_arg *args)
1560 struct xfs_alloc_cur acur = {};
1561 int error; /* error code */
1562 int i; /* result code, temporary */
1566 /* handle uninitialized agbno range so caller doesn't have to */
1567 if (!args->min_agbno && !args->max_agbno)
1568 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1569 ASSERT(args->min_agbno <= args->max_agbno);
1571 /* clamp agbno to the range if it's outside */
1572 if (args->agbno < args->min_agbno)
1573 args->agbno = args->min_agbno;
1574 if (args->agbno > args->max_agbno)
1575 args->agbno = args->max_agbno;
1581 * Set up cursors and see if there are any free extents as big as
1582 * maxlen. If not, pick the last entry in the tree unless the tree is
1585 error = xfs_alloc_cur_setup(args, &acur);
/* -ENOSPC from setup: fall back to the small-allocation path. */
1586 if (error == -ENOSPC) {
1587 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1591 if (i == 0 || len == 0) {
1592 trace_xfs_alloc_near_noentry(args);
1602 * If the requested extent is large wrt the freespaces available
1603 * in this a.g., then the cursor will be pointing to a btree entry
1604 * near the right edge of the tree. If it's in the last btree leaf
1605 * block, then we just examine all the entries in that block
1606 * that are big enough, and pick the best one.
1608 if (xfs_btree_islastblock(acur.cnt, 0)) {
1609 bool allocated = false;
1611 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1620 * Second algorithm. Combined cntbt and bnobt search to find ideal
1623 error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1628 * If we couldn't get anything, give up.
/* Busy extents blocked us: force them unbusy, caller may retry. */
1632 trace_xfs_alloc_near_busy(args);
1633 xfs_extent_busy_flush(args->mp, args->pag,
1637 trace_xfs_alloc_size_neither(args);
1638 args->agbno = NULLAGBLOCK;
1643 /* fix up btrees on a successful allocation */
1644 error = xfs_alloc_cur_finish(args, &acur);
1647 xfs_alloc_cur_close(&acur, error);
/*
 * NOTE(review): extract is missing interleaved source lines (loop
 * headers, goto labels, closing braces); comments below describe only
 * the visible code.
 */
1652 * Allocate a variable extent anywhere in the allocation group agno.
1653 * Extent's length (returned in len) will be between minlen and maxlen,
1654 * and of the form k * prod + mod unless there's nothing that large.
1655 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1657 STATIC int /* error */
1658 xfs_alloc_ag_vextent_size(
1659 xfs_alloc_arg_t *args) /* allocation argument structure */
1661 struct xfs_agf *agf = args->agbp->b_addr;
1662 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1663 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1664 int error; /* error result */
1665 xfs_agblock_t fbno; /* start of found freespace */
1666 xfs_extlen_t flen; /* length of found freespace */
1667 int i; /* temp status variable */
1668 xfs_agblock_t rbno; /* returned block number */
1669 xfs_extlen_t rlen; /* length of returned extent */
1675 * Allocate and initialize a cursor for the by-size btree.
1677 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1678 args->agno, XFS_BTNUM_CNT);
1683 * Look for an entry >= maxlen+alignment-1 blocks.
1685 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1686 args->maxlen + args->alignment - 1, &i)))
1690 * If none then we have to settle for a smaller extent. In the case that
1691 * there are no large extents, this will return the last entry in the
1692 * tree unless the tree is empty. In the case that there are only busy
1693 * large extents, this will return the largest small extent unless there
1694 * are no smaller extents available.
1697 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1701 if (i == 0 || flen == 0) {
1702 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1703 trace_xfs_alloc_size_noentry(args);
/* Trim the candidate against busy extents and alignment. */
1707 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1711 * Search for a non-busy extent that is large enough.
1714 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1717 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1718 error = -EFSCORRUPTED;
1722 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1723 &rbno, &rlen, &busy_gen);
1725 if (rlen >= args->maxlen)
1728 error = xfs_btree_increment(cnt_cur, 0, &i);
1733 * Our only valid extents must have been busy.
1734 * Make it unbusy by forcing the log out and
1737 xfs_btree_del_cursor(cnt_cur,
1739 trace_xfs_alloc_size_busy(args);
1740 xfs_extent_busy_flush(args->mp,
1741 args->pag, busy_gen);
1748 * In the first case above, we got the last entry in the
1749 * by-size btree. Now we check to see if the space hits maxlen
1750 * once aligned; if not, we search left for something better.
1751 * This can't happen in the second case above.
1753 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1754 if (XFS_IS_CORRUPT(args->mp,
1757 rbno + rlen > fbno + flen))) {
1758 error = -EFSCORRUPTED;
/* Aligned hit is shorter than maxlen: scan left for a better record. */
1761 if (rlen < args->maxlen) {
1762 xfs_agblock_t bestfbno;
1763 xfs_extlen_t bestflen;
1764 xfs_agblock_t bestrbno;
1765 xfs_extlen_t bestrlen;
1772 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1776 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1779 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1780 error = -EFSCORRUPTED;
/* Records only get shorter leftwards; stop once they can't beat best. */
1783 if (flen < bestrlen)
1785 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1786 &rbno, &rlen, &busy_gen);
1787 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1788 if (XFS_IS_CORRUPT(args->mp,
1791 rbno + rlen > fbno + flen))) {
1792 error = -EFSCORRUPTED;
1795 if (rlen > bestrlen) {
1800 if (rlen == args->maxlen)
/* Reposition the cursor on the best record found by the scan. */
1804 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1807 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1808 error = -EFSCORRUPTED;
1816 args->wasfromfl = 0;
1818 * Fix up the length.
1821 if (rlen < args->minlen) {
1823 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1824 trace_xfs_alloc_size_busy(args);
1825 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1830 xfs_alloc_fix_len(args);
1833 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1834 error = -EFSCORRUPTED;
1838 * Allocate and initialize a cursor for the by-block tree.
1840 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1841 args->agno, XFS_BTNUM_BNO);
1842 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1843 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1845 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1846 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1847 cnt_cur = bno_cur = NULL;
/* Sanity: allocation must stay inside the AG per the on-disk AGF. */
1850 if (XFS_IS_CORRUPT(args->mp,
1851 args->agbno + args->len >
1852 be32_to_cpu(agf->agf_length))) {
1853 error = -EFSCORRUPTED;
1856 trace_xfs_alloc_size_done(args);
1860 trace_xfs_alloc_size_error(args);
1862 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1864 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1868 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1869 trace_xfs_alloc_size_nominleft(args);
1870 args->agbno = NULLAGBLOCK;
/*
 * Free the extent starting at agno/bno for length, merging it with any
 * contiguous left/right free-space neighbors in the by-block btree and
 * keeping the by-size btree in sync.
 *
 * FIX(review): this extract had HTML-entity damage — "&ltbno"/"&ltlen"
 * and "&gtbno"/"&gtlen" had collapsed to "<bno"/"<len" and ">bno"/">len"
 * in the two xfs_alloc_get_rec() calls below; the address-of operands
 * are restored. NOTE(review): interleaved source lines (goto labels,
 * closing braces, returns) are missing from this extract.
 */
1875 * Free the extent starting at agno/bno for length.
1879 struct xfs_trans *tp,
1880 struct xfs_buf *agbp,
1881 xfs_agnumber_t agno,
1884 const struct xfs_owner_info *oinfo,
1885 enum xfs_ag_resv_type type)
1887 struct xfs_mount *mp;
1888 struct xfs_btree_cur *bno_cur;
1889 struct xfs_btree_cur *cnt_cur;
1890 xfs_agblock_t gtbno; /* start of right neighbor */
1891 xfs_extlen_t gtlen; /* length of right neighbor */
1892 xfs_agblock_t ltbno; /* start of left neighbor */
1893 xfs_extlen_t ltlen; /* length of left neighbor */
1894 xfs_agblock_t nbno; /* new starting block of freesp */
1895 xfs_extlen_t nlen; /* new length of freespace */
1896 int haveleft; /* have a left neighbor */
1897 int haveright; /* have a right neighbor */
1901 bno_cur = cnt_cur = NULL;
/* Drop the rmap record first unless the owner update is to be skipped. */
1904 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1905 error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
1911 * Allocate and initialize a cursor for the by-block btree.
1913 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1915 * Look for a neighboring block on the left (lower block numbers)
1916 * that is contiguous with this space.
1918 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1922 * There is a block to our left.
1924 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1926 if (XFS_IS_CORRUPT(mp, i != 1)) {
1927 error = -EFSCORRUPTED;
1931 * It's not contiguous, though.
1933 if (ltbno + ltlen < bno)
1937 * If this failure happens the request to free this
1938 * space was invalid, it's (partly) already free.
1941 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1942 error = -EFSCORRUPTED;
1948 * Look for a neighboring block on the right (higher block numbers)
1949 * that is contiguous with this space.
1951 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1955 * There is a block to our right.
1957 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1959 if (XFS_IS_CORRUPT(mp, i != 1)) {
1960 error = -EFSCORRUPTED;
1964 * It's not contiguous, though.
1966 if (bno + len < gtbno)
1970 * If this failure happens the request to free this
1971 * space was invalid, it's (partly) already free.
1974 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1975 error = -EFSCORRUPTED;
1981 * Now allocate and initialize a cursor for the by-size tree.
1983 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1985 * Have both left and right contiguous neighbors.
1986 * Merge all three into a single free block.
1988 if (haveleft && haveright) {
1990 * Delete the old by-size entry on the left.
1992 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1994 if (XFS_IS_CORRUPT(mp, i != 1)) {
1995 error = -EFSCORRUPTED;
1998 if ((error = xfs_btree_delete(cnt_cur, &i)))
2000 if (XFS_IS_CORRUPT(mp, i != 1)) {
2001 error = -EFSCORRUPTED;
2005 * Delete the old by-size entry on the right.
2007 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2009 if (XFS_IS_CORRUPT(mp, i != 1)) {
2010 error = -EFSCORRUPTED;
2013 if ((error = xfs_btree_delete(cnt_cur, &i)))
2015 if (XFS_IS_CORRUPT(mp, i != 1)) {
2016 error = -EFSCORRUPTED;
2020 * Delete the old by-block entry for the right block.
2022 if ((error = xfs_btree_delete(bno_cur, &i)))
2024 if (XFS_IS_CORRUPT(mp, i != 1)) {
2025 error = -EFSCORRUPTED;
2029 * Move the by-block cursor back to the left neighbor.
2031 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2033 if (XFS_IS_CORRUPT(mp, i != 1)) {
2034 error = -EFSCORRUPTED;
2039 * Check that this is the right record: delete didn't
2040 * mangle the cursor.
2043 xfs_agblock_t xxbno;
2046 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2049 if (XFS_IS_CORRUPT(mp,
2053 error = -EFSCORRUPTED;
2059 * Update remaining by-block entry to the new, joined block.
2062 nlen = len + ltlen + gtlen;
2063 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2067 * Have only a left contiguous neighbor.
2068 * Merge it together with the new freespace.
2070 else if (haveleft) {
2072 * Delete the old by-size entry on the left.
2074 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2076 if (XFS_IS_CORRUPT(mp, i != 1)) {
2077 error = -EFSCORRUPTED;
2080 if ((error = xfs_btree_delete(cnt_cur, &i)))
2082 if (XFS_IS_CORRUPT(mp, i != 1)) {
2083 error = -EFSCORRUPTED;
2087 * Back up the by-block cursor to the left neighbor, and
2088 * update its length.
2090 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2092 if (XFS_IS_CORRUPT(mp, i != 1)) {
2093 error = -EFSCORRUPTED;
2098 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2102 * Have only a right contiguous neighbor.
2103 * Merge it together with the new freespace.
2105 else if (haveright) {
2107 * Delete the old by-size entry on the right.
2109 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2111 if (XFS_IS_CORRUPT(mp, i != 1)) {
2112 error = -EFSCORRUPTED;
2115 if ((error = xfs_btree_delete(cnt_cur, &i)))
2117 if (XFS_IS_CORRUPT(mp, i != 1)) {
2118 error = -EFSCORRUPTED;
2122 * Update the starting block and length of the right
2123 * neighbor in the by-block tree.
2127 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2131 * No contiguous neighbors.
2132 * Insert the new freespace into the by-block tree.
2137 if ((error = xfs_btree_insert(bno_cur, &i)))
2139 if (XFS_IS_CORRUPT(mp, i != 1)) {
2140 error = -EFSCORRUPTED;
2144 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2147 * In all cases we need to insert the new freespace in the by-size tree.
/* The merged record must NOT already exist in the cntbt (i != 0). */
2149 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2151 if (XFS_IS_CORRUPT(mp, i != 0)) {
2152 error = -EFSCORRUPTED;
2155 if ((error = xfs_btree_insert(cnt_cur, &i)))
2157 if (XFS_IS_CORRUPT(mp, i != 1)) {
2158 error = -EFSCORRUPTED;
2161 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2165 * Update the freespace totals in the ag and superblock.
2167 error = xfs_alloc_update_counters(tp, agbp, len);
2168 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2172 XFS_STATS_INC(mp, xs_freex);
2173 XFS_STATS_ADD(mp, xs_freeb, len);
2175 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2180 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2182 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2184 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2189 * Visible (exported) allocation/free functions.
2190 * Some of these are used just by xfs_alloc_btree.c and this file.
/*
 * NOTE(review): extract is missing the surrounding braces of this
 * function body.
 */
2194 * Compute and fill in value of m_ag_maxlevels.
2197 xfs_alloc_compute_maxlevels(
2198 xfs_mount_t *mp) /* file system mount structure */
/* Worst case: every free-space record covers two blocks (hence /2). */
2200 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2201 (mp->m_sb.sb_agblocks + 1) / 2);
/*
 * NOTE(review): extract is missing interleaved source lines; comments
 * below describe only the visible code.
 */
2205 * Find the length of the longest extent in an AG. The 'need' parameter
2206 * specifies how much space we're going to need for the AGFL and the
2207 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2211 xfs_alloc_longest_free_extent(
2212 struct xfs_perag *pag,
2214 xfs_extlen_t reserved)
2216 xfs_extlen_t delta = 0;
2219 * If the AGFL needs a recharge, we'll have to subtract that from the
2222 if (need > pag->pagf_flcount)
2223 delta = need - pag->pagf_flcount;
2226 * If we cannot maintain others' reservations with space from the
2227 * not-longest freesp extents, we'll have to subtract /that/ from
2228 * the longest extent too.
2230 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2231 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2234 * If the longest extent is long enough to satisfy all the
2235 * reservations and AGFL rules in place, we can return this extent.
2237 if (pag->pagf_longest > delta)
/* Cap the answer at the per-AG maximum usable space. */
2238 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2239 pag->pagf_longest - delta);
2241 /* Otherwise, let the caller try for 1 block if there's space. */
2242 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
/*
 * NOTE(review): extract is missing interleaved source lines (including
 * the final return); comments below describe only the visible code.
 */
2246 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2247 * return the largest possible minimum length.
2250 xfs_alloc_min_freelist(
2251 struct xfs_mount *mp,
2252 struct xfs_perag *pag)
2254 /* AG btrees have at least 1 level. */
2255 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2256 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
2257 unsigned int min_free;
2259 ASSERT(mp->m_ag_maxlevels > 0);
2261 /* space needed by-bno freespace btree */
2262 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2263 mp->m_ag_maxlevels);
2264 /* space needed by-size freespace btree */
2265 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2266 mp->m_ag_maxlevels);
2267 /* space needed reverse mapping used space btree */
2268 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2269 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2270 mp->m_rmap_maxlevels);
/*
 * NOTE(review): extract is missing interleaved source lines (the
 * true/false returns between the checks); comments below describe only
 * the visible code.
 */
2276 * Check if the operation we are fixing up the freelist for should go ahead or
2277 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2278 * is dependent on whether the size and shape of free space available will
2279 * permit the requested allocation to take place.
2282 xfs_alloc_space_available(
2283 struct xfs_alloc_arg *args,
2284 xfs_extlen_t min_free,
2287 struct xfs_perag *pag = args->pag;
2288 xfs_extlen_t alloc_len, longest;
2289 xfs_extlen_t reservation; /* blocks that are still reserved */
2291 xfs_extlen_t agflcount;
/* Frees are always allowed; only allocations need the checks below. */
2293 if (flags & XFS_ALLOC_FLAG_FREEING)
2296 reservation = xfs_ag_resv_needed(pag, args->resv);
2298 /* do we have enough contiguous free space for the allocation? */
2299 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2300 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2301 if (longest < alloc_len)
2305 * Do we have enough free space remaining for the allocation? Don't
2306 * account extra agfl blocks because we are about to defer free them,
2307 * making them unavailable until the current transaction commits.
2309 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free)_
2310 available = (int)(pag->pagf_freeblks + agflcount -
2311 reservation - min_free - args->minleft);
2312 if (available < (int)max(args->total, alloc_len))
2316 * Clamp maxlen to the amount of free space available for the actual
2317 * extent allocation.
2319 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2320 args->maxlen = available;
2321 ASSERT(args->maxlen > 0);
2322 ASSERT(args->maxlen >= args->minlen);
/*
 * Free a single block that came off the AGFL: return it to the free
 * space btrees, then invalidate its buffer so stale contents are never
 * written back.
 * NOTE(review): extract is missing interleaved source lines (error
 * returns between the calls).
 */
2329 xfs_free_agfl_block(
2330 struct xfs_trans *tp,
2331 xfs_agnumber_t agno,
2332 xfs_agblock_t agbno,
2333 struct xfs_buf *agbp,
2334 struct xfs_owner_info *oinfo)
/* One block, accounted against the AGFL reservation type. */
2339 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2344 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2345 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2346 tp->t_mountp->m_bsize, 0, &bp);
2349 xfs_trans_binval(tp, bp);
/*
 * NOTE(review): extract is missing interleaved source lines (including
 * the wrapped/unwrapped active-range computation and final compare);
 * comments below describe only the visible code.
 */
2355 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2356 * is to detect an agfl header padding mismatch between current and early v5
2357 * kernels. This problem manifests as a 1-slot size difference between the
2358 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2359 * may also catch variants of agfl count corruption unrelated to padding. Either
2360 * way, we'll reset the agfl and warn the user.
2362 * Return true if a reset is required before the agfl can be used, false
2366 xfs_agfl_needs_reset(
2367 struct xfs_mount *mp,
2368 struct xfs_agf *agf)
2370 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2371 uint32_t l = be32_to_cpu(agf->agf_fllast);
2372 uint32_t c = be32_to_cpu(agf->agf_flcount);
2373 int agfl_size = xfs_agfl_size(mp);
2376 /* no agfl header on v4 supers */
2377 if (!xfs_sb_version_hascrc(&mp->m_sb))
2381 * The agf read verifier catches severe corruption of these fields.
2382 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2383 * the verifier allows it.
2385 if (f >= agfl_size || l >= agfl_size)
2391 * Check consistency between the on-disk count and the active range. An
2392 * agfl padding mismatch manifests as an inconsistent flcount.
/* Wrapped-list case: active slots span the end and start of the array. */
2397 active = agfl_size - f + l + 1;
/*
 * NOTE(review): extract is missing interleaved source lines; comments
 * below describe only the visible code.
 */
2405 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2406 * agfl content cannot be trusted. Warn the user that a repair is required to
2407 * recover leaked blocks.
2409 * The purpose of this mechanism is to handle filesystems affected by the agfl
2410 * header padding mismatch problem. A reset keeps the filesystem online with a
2411 * relatively minor free space accounting inconsistency rather than suffer the
2412 * inevitable crash from use of an invalid agfl block.
2416 struct xfs_trans *tp,
2417 struct xfs_buf *agbp,
2418 struct xfs_perag *pag)
2420 struct xfs_mount *mp = tp->t_mountp;
2421 struct xfs_agf *agf = agbp->b_addr;
2423 ASSERT(pag->pagf_agflreset);
2424 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2427 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2428 "Please unmount and run xfs_repair.",
2429 pag->pag_agno, pag->pagf_flcount);
/* Empty list: first = 0, last = size - 1, count = 0; then log it. */
2431 agf->agf_flfirst = 0;
2432 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2433 agf->agf_flcount = 0;
2434 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
/* Keep the in-core perag state in step with the on-disk AGF. */
2437 pag->pagf_flcount = 0;
2438 pag->pagf_agflreset = false;
/*
 * NOTE(review): extract is missing interleaved source lines; comments
 * below describe only the visible code.
 */
2442 * Defer an AGFL block free. This is effectively equivalent to
2443 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2445 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2446 * allocation operations in a transaction. AGFL frees are prone to this problem
2447 * because for one they are always freed one at a time. Further, an immediate
2448 * AGFL block free can cause a btree join and require another block free before
2449 * the real allocation can proceed. Deferring the free disconnects freeing up
2450 * the AGFL slot from freeing the block.
2453 xfs_defer_agfl_block(
2454 struct xfs_trans *tp,
2455 xfs_agnumber_t agno,
2456 xfs_fsblock_t agbno,
2457 struct xfs_owner_info *oinfo)
2459 struct xfs_mount *mp = tp->t_mountp;
2460 struct xfs_extent_free_item *new; /* new element */
2462 ASSERT(xfs_bmap_free_item_zone != NULL);
2463 ASSERT(oinfo != NULL);
/* __GFP_NOFAIL: allocation cannot fail, so no NULL check is needed. */
2465 new = kmem_cache_alloc(xfs_bmap_free_item_zone,
2466 GFP_KERNEL | __GFP_NOFAIL);
2467 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2468 new->xefi_blockcount = 1;
2469 new->xefi_oinfo = *oinfo;
2470 new->xefi_skip_discard = false;
2472 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
/* Queue the one-block free as a deferred op on this transaction. */
2474 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
/*
 * NOTE(review): extract is missing interleaved source lines (error
 * checks and the condition guarding the -EFSCORRUPTED assignment);
 * comments below describe only the visible code.
 */
2479 * Check if an AGF has a free extent record whose length is equal to
2483 xfs_exact_minlen_extent_available(
2484 struct xfs_alloc_arg *args,
2485 struct xfs_buf *agbp,
2488 struct xfs_btree_cur *cnt_cur;
2493 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2494 args->agno, XFS_BTNUM_CNT);
/* Smallest record >= minlen; an exact match must be this record. */
2495 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2500 error = -EFSCORRUPTED;
2504 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
/* Found a record but it is longer than minlen: report unavailable. */
2508 if (*stat == 1 && flen != args->minlen)
2512 xfs_btree_del_cursor(cnt_cur, error);
/*
 * NOTE(review): extract is missing interleaved source lines (error
 * checks, goto labels, the agf join, and per-iteration bookkeeping);
 * comments below describe only the visible code.
 */
2519 * Decide whether to use this allocation group for this allocation.
2520 * If so, fix up the btree freelist's size.
2523 xfs_alloc_fix_freelist(
2524 struct xfs_alloc_arg *args, /* allocation argument structure */
2525 int flags) /* XFS_ALLOC_FLAG_... */
2527 struct xfs_mount *mp = args->mp;
2528 struct xfs_perag *pag = args->pag;
2529 struct xfs_trans *tp = args->tp;
2530 struct xfs_buf *agbp = NULL;
2531 struct xfs_buf *agflbp = NULL;
2532 struct xfs_alloc_arg targs; /* local allocation arguments */
2533 xfs_agblock_t bno; /* freelist block */
2534 xfs_extlen_t need; /* total blocks needed in freelist */
2537 /* deferred ops (AGFL block frees) require permanent transactions */
2538 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
/* First reference to this AG: read the AGF to populate the perag. */
2540 if (!pag->pagf_init) {
2541 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2543 /* Couldn't lock the AGF so skip this AG. */
2544 if (error == -EAGAIN)
2551 * If this is a metadata preferred pag and we are user data then try
2552 * somewhere else if we are not being asked to try harder at this
2555 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2556 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2557 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2558 goto out_agbp_relse;
/* Cheap pre-check against cached perag counters before locking work. */
2561 need = xfs_alloc_min_freelist(mp, pag);
2562 if (!xfs_alloc_space_available(args, need, flags |
2563 XFS_ALLOC_FLAG_CHECK))
2564 goto out_agbp_relse;
2567 * Get the a.g. freespace buffer.
2568 * Can fail if we're not blocking on locks, and it's held.
2571 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2573 /* Couldn't lock the AGF so skip this AG. */
2574 if (error == -EAGAIN)
2580 /* reset a padding mismatched agfl before final free space check */
2581 if (pag->pagf_agflreset)
2582 xfs_agfl_reset(tp, agbp, pag);
2584 /* If there isn't enough total space or single-extent, reject it. */
2585 need = xfs_alloc_min_freelist(mp, pag);
2586 if (!xfs_alloc_space_available(args, need, flags))
2587 goto out_agbp_relse;
2590 if (args->alloc_minlen_only) {
2593 error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2595 goto out_agbp_relse;
2599 * Make the freelist shorter if it's too long.
2601 * Note that from this point onwards, we will always release the agf and
2602 * agfl buffers on error. This handles the case where we error out and
2603 * the buffers are clean or may not have been joined to the transaction
2604 * and hence need to be released manually. If they have been joined to
2605 * the transaction, then xfs_trans_brelse() will handle them
2606 * appropriately based on the recursion count and dirty state of the
2609 * XXX (dgc): When we have lots of free space, does this buy us
2610 * anything other than extra overhead when we need to put more blocks
2611 * back on the free list? Maybe we should only do this when space is
2612 * getting low or the AGFL is more than half full?
2614 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2615 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2616 * updating the rmapbt. Both flags are used in xfs_repair while we're
2617 * rebuilding the rmapbt, and neither are used by the kernel. They're
2618 * both required to ensure that rmaps are correctly recorded for the
2619 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2620 * repair/rmap.c in xfsprogs for details.
2622 memset(&targs, 0, sizeof(targs));
2623 /* struct copy below */
2624 if (flags & XFS_ALLOC_FLAG_NORMAP)
2625 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2627 targs.oinfo = XFS_RMAP_OINFO_AG;
/* Shrink: pull blocks off the AGFL and defer their frees. */
2628 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2629 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2631 goto out_agbp_relse;
2633 /* defer agfl frees */
2634 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2640 targs.agno = args->agno;
2641 targs.alignment = targs.minlen = targs.prod = 1;
2642 targs.type = XFS_ALLOCTYPE_THIS_AG;
2644 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2646 goto out_agbp_relse;
2648 /* Make the freelist longer if it's too short. */
2649 while (pag->pagf_flcount < need) {
2651 targs.maxlen = need - pag->pagf_flcount;
2652 targs.resv = XFS_AG_RESV_AGFL;
2654 /* Allocate as many blocks as possible at once. */
2655 error = xfs_alloc_ag_vextent(&targs);
2657 goto out_agflbp_relse;
2660 * Stop if we run out. Won't happen if callers are obeying
2661 * the restrictions correctly. Can happen for free calls
2662 * on a completely full ag.
2664 if (targs.agbno == NULLAGBLOCK) {
2665 if (flags & XFS_ALLOC_FLAG_FREEING)
2667 goto out_agflbp_relse;
2670 * Put each allocated block on the list.
2672 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2673 error = xfs_alloc_put_freelist(tp, agbp,
2676 goto out_agflbp_relse;
2679 xfs_trans_brelse(tp, agflbp);
2684 xfs_trans_brelse(tp, agflbp);
2687 xfs_trans_brelse(tp, agbp);
2694 * Get a block from the freelist.
2695 * Returns with the buffer for the block gotten.
/*
 * NOTE(review): embedded line numbers are non-contiguous; interior lines of
 * this function are elided in this view (error checks, return path, etc.).
 */
2698 xfs_alloc_get_freelist(
2699 xfs_trans_t *tp, /* transaction pointer */
2700 struct xfs_buf *agbp, /* buffer containing the agf structure */
2701 xfs_agblock_t *bnop, /* block address retrieved from freelist */
2702 int btreeblk) /* destination is a AGF btree */
2704 struct xfs_agf *agf = agbp->b_addr;
2705 struct xfs_buf *agflbp;/* buffer for a.g. freelist structure */
2706 xfs_agblock_t bno; /* block number returned */
2710 xfs_mount_t *mp = tp->t_mountp;
2711 xfs_perag_t *pag; /* per allocation group data */
/* An empty freelist is not an error: report NULLAGBLOCK to the caller. */
2714 * Freelist is empty, give up.
2716 if (!agf->agf_flcount) {
2717 *bnop = NULLAGBLOCK;
2721 * Read the array of free blocks.
2723 error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2730 * Get the block number and update the data structures.
/* Pop the head entry: the AGFL is a circular array indexed by agf_flfirst. */
2732 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2733 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2734 be32_add_cpu(&agf->agf_flfirst, 1);
2735 xfs_trans_brelse(tp, agflbp);
/* Wrap the head index when it walks off the end of the AGFL array. */
2736 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2737 agf->agf_flfirst = 0;
/* Keep the on-disk AGF counters and the in-core perag in sync. */
2740 ASSERT(!pag->pagf_agflreset);
2741 be32_add_cpu(&agf->agf_flcount, -1);
2742 xfs_trans_agflist_delta(tp, -1);
2743 pag->pagf_flcount--;
2745 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
/*
 * Presumably guarded by "if (btreeblk)" on an elided line: account the
 * block as consumed by an AGF btree — TODO confirm against full source.
 */
2747 be32_add_cpu(&agf->agf_btreeblks, 1);
2748 pag->pagf_btreeblks++;
2749 logflags |= XFS_AGF_BTREEBLKS;
2752 xfs_alloc_log_agf(tp, agbp, logflags);
2759 * Log the given fields from the agf structure.
/* NOTE(review): function header line (xfs_alloc_log_agf) is elided here. */
2763 xfs_trans_t *tp, /* transaction pointer */
2764 struct xfs_buf *bp, /* buffer for a.g. freelist header */
2765 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2767 int first; /* first byte offset */
2768 int last; /* last byte offset */
/*
 * Byte offset of each AGF field, in XFS_AGF_* flag-bit order; consumed by
 * xfs_btree_offsets() below to translate the field mask into a byte range.
 */
2769 static const short offsets[] = {
2770 offsetof(xfs_agf_t, agf_magicnum),
2771 offsetof(xfs_agf_t, agf_versionnum),
2772 offsetof(xfs_agf_t, agf_seqno),
2773 offsetof(xfs_agf_t, agf_length),
2774 offsetof(xfs_agf_t, agf_roots[0]),
2775 offsetof(xfs_agf_t, agf_levels[0]),
2776 offsetof(xfs_agf_t, agf_flfirst),
2777 offsetof(xfs_agf_t, agf_fllast),
2778 offsetof(xfs_agf_t, agf_flcount),
2779 offsetof(xfs_agf_t, agf_freeblks),
2780 offsetof(xfs_agf_t, agf_longest),
2781 offsetof(xfs_agf_t, agf_btreeblks),
2782 offsetof(xfs_agf_t, agf_uuid),
2783 offsetof(xfs_agf_t, agf_rmap_blocks),
2784 offsetof(xfs_agf_t, agf_refcount_blocks),
2785 offsetof(xfs_agf_t, agf_refcount_root),
2786 offsetof(xfs_agf_t, agf_refcount_level),
2787 /* needed so that we don't log the whole rest of the structure: */
2788 offsetof(xfs_agf_t, agf_spare64),
2792 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
/* Tag the buffer so log recovery knows it holds an AGF. */
2794 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
/* Convert the field mask to a [first, last] byte range and log it. */
2796 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2797 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2801 * Interface for inode allocation to force the pag data to be initialized.
2804 xfs_alloc_pagf_init(
2805 xfs_mount_t *mp, /* file system mount structure */
2806 xfs_trans_t *tp, /* transaction pointer */
2807 xfs_agnumber_t agno, /* allocation group number */
2808 int flags) /* XFS_ALLOC_FLAGS_... */
/*
 * Reading the AGF populates the in-core perag (see xfs_alloc_read_agf);
 * the buffer itself is not needed, so release it immediately.
 */
2813 error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
2815 xfs_trans_brelse(tp, bp);
2820 * Put the block on the freelist for the allocation group.
2823 xfs_alloc_put_freelist(
2824 xfs_trans_t *tp, /* transaction pointer */
2825 struct xfs_buf *agbp, /* buffer for a.g. freelist header */
2826 struct xfs_buf *agflbp,/* buffer for a.g. free block array */
2827 xfs_agblock_t bno, /* block being freed */
2828 int btreeblk) /* block came from a AGF btree */
2830 struct xfs_mount *mp = tp->t_mountp;
2831 struct xfs_agf *agf = agbp->b_addr;
2832 __be32 *blockp;/* pointer to array entry */
2835 xfs_perag_t *pag; /* per allocation group data */
/* Caller may pass agflbp == NULL; read the AGFL buffer on demand. */
2839 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2840 be32_to_cpu(agf->agf_seqno), &agflbp)))
/* Advance the tail index, wrapping around the circular AGFL array. */
2842 be32_add_cpu(&agf->agf_fllast, 1);
2843 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2844 agf->agf_fllast = 0;
/* Keep the on-disk AGF counters and the in-core perag in sync. */
2847 ASSERT(!pag->pagf_agflreset);
2848 be32_add_cpu(&agf->agf_flcount, 1);
2849 xfs_trans_agflist_delta(tp, 1);
2850 pag->pagf_flcount++;
2852 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
/*
 * Presumably guarded by "if (btreeblk)" on an elided line: the block is
 * being returned by an AGF btree — TODO confirm against full source.
 */
2854 be32_add_cpu(&agf->agf_btreeblks, -1);
2855 pag->pagf_btreeblks--;
2856 logflags |= XFS_AGF_BTREEBLKS;
2859 xfs_alloc_log_agf(tp, agbp, logflags);
2861 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
/* Store the freed block number at the new tail slot and log that range. */
2863 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2864 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2865 *blockp = cpu_to_be32(bno);
2866 startoff = (char *)blockp - (char *)agflbp->b_addr;
2868 xfs_alloc_log_agf(tp, agbp, logflags);
2870 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2871 xfs_trans_log_buf(tp, agflbp, startoff,
2872 startoff + sizeof(xfs_agblock_t) - 1);
/*
 * Structural verifier for an AGF buffer: returns __this_address at the
 * first failed sanity check, implicitly NULL (elided) when all pass.
 */
2876 static xfs_failaddr_t
2880 struct xfs_mount *mp = bp->b_mount;
2881 struct xfs_agf *agf = bp->b_addr;
/* CRC-enabled (v5) filesystems carry a UUID and an LSN to validate. */
2883 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2884 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2885 return __this_address;
2886 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2887 return __this_address;
2890 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2891 return __this_address;
/* Version plus freelist indices/count must fit within the AGFL array. */
2893 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2894 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2895 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2896 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2897 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2898 return __this_address;
2900 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2901 return __this_address;
/* The longest free extent can't exceed the total free block count. */
2903 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2904 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2905 return __this_address;
/* bnobt/cntbt levels must be within [1, XFS_BTREE_MAXLEVELS]. */
2907 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2908 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2909 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2910 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2911 return __this_address;
/* rmapbt fields are only meaningful when the feature is enabled. */
2913 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2914 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2915 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
2916 return __this_address;
2918 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2919 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2920 return __this_address;
2923 * during growfs operations, the perag is not fully initialised,
2924 * so we can't use it for any useful checking. growfs ensures we can't
2925 * use it by using uncached buffers that don't have the perag attached
2926 * so we can detect and avoid this problem.
2928 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2929 return __this_address;
/* agf_btreeblks is only maintained with lazy superblock counters. */
2931 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2932 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2933 return __this_address;
/* refcountbt fields are only meaningful with reflink enabled. */
2935 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2936 be32_to_cpu(agf->agf_refcount_blocks) >
2937 be32_to_cpu(agf->agf_length))
2938 return __this_address;
2940 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2941 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2942 be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
2943 return __this_address;
/*
 * Buffer read verifier: check the CRC first (v5 only), then the structural
 * checks in xfs_agf_verify(); report failures via xfs_verifier_error().
 */
2950 xfs_agf_read_verify(
2953 struct xfs_mount *mp = bp->b_mount;
2956 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2957 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2958 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
/* XFS_TEST_ERROR allows error injection via XFS_ERRTAG_ALLOC_READ_AGF. */
2960 fa = xfs_agf_verify(bp);
2961 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2962 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
/*
 * Buffer write verifier: run the structural checks, then (v5 only) stamp
 * the last-modification LSN from the buffer log item and recompute the CRC.
 */
2967 xfs_agf_write_verify(
2970 struct xfs_mount *mp = bp->b_mount;
2971 struct xfs_buf_log_item *bip = bp->b_log_item;
2972 struct xfs_agf *agf = bp->b_addr;
2975 fa = xfs_agf_verify(bp);
2977 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
/* Pre-CRC filesystems have no checksum or LSN to update. */
2981 if (!xfs_sb_version_hascrc(&mp->m_sb))
/* Presumably guarded by "if (bip)" on an elided line — TODO confirm. */
2985 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2987 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
/* Buffer ops table wiring the AGF verifiers into the buffer cache. */
2990 const struct xfs_buf_ops xfs_agf_buf_ops = {
/* Same magic for both slots: the AGF magic is unchanged across versions. */
2992 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2993 .verify_read = xfs_agf_read_verify,
2994 .verify_write = xfs_agf_write_verify,
2995 .verify_struct = xfs_agf_verify,
2999 * Read in the allocation group header (free/alloc section).
3003 struct xfs_mount *mp, /* mount point structure */
3004 struct xfs_trans *tp, /* transaction pointer */
3005 xfs_agnumber_t agno, /* allocation group number */
3006 int flags, /* XFS_BUF_ */
3007 struct xfs_buf **bpp) /* buffer for the ag freelist header */
3011 trace_xfs_read_agf(mp, agno);
3013 ASSERT(agno != NULLAGNUMBER);
/* Read one sector at the AGF daddr; verification via xfs_agf_buf_ops. */
3014 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3015 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
3016 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
3020 ASSERT(!(*bpp)->b_error);
/* Raise the buffer's LRU reference so the AGF stays cached longer. */
3021 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
3026 * Read in the allocation group header (free/alloc section).
3030 struct xfs_mount *mp, /* mount point structure */
3031 struct xfs_trans *tp, /* transaction pointer */
3032 xfs_agnumber_t agno, /* allocation group number */
3033 int flags, /* XFS_ALLOC_FLAG_... */
3034 struct xfs_buf **bpp) /* buffer for the ag freelist header */
3036 struct xfs_agf *agf; /* ag freelist header */
3037 struct xfs_perag *pag; /* per allocation group data */
3040 trace_xfs_alloc_read_agf(mp, agno);
3042 /* We don't support trylock when freeing. */
3043 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3044 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3045 ASSERT(agno != NULLAGNUMBER);
/* Translate the allocator trylock flag to the buffer-cache flag. */
3046 error = xfs_read_agf(mp, tp, agno,
3047 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3051 ASSERT(!(*bpp)->b_error);
3053 agf = (*bpp)->b_addr;
3054 pag = (*bpp)->b_pag;
/* First read of this AG: seed the in-core perag from the on-disk AGF. */
3055 if (!pag->pagf_init) {
3056 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3057 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3058 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3059 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3060 pag->pagf_levels[XFS_BTNUM_BNOi] =
3061 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3062 pag->pagf_levels[XFS_BTNUM_CNTi] =
3063 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3064 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3065 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3066 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3068 pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
/*
 * Debug cross-check: a previously-initialised perag must still agree
 * with the on-disk AGF (skipped after a forced shutdown, when the two
 * may legitimately diverge).
 */
3071 else if (!XFS_FORCED_SHUTDOWN(mp)) {
3072 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3073 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3074 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3075 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3076 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3077 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3078 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3079 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3086 * Allocate an extent (variable-size).
3087 * Depending on the allocation type, we either look in a single allocation
3088 * group or loop over the allocation groups to find the result.
/*
 * NOTE(review): this dispatcher's control flow is order-sensitive and the
 * view elides interior lines; code left byte-identical, comments only.
 */
3092 struct xfs_alloc_arg *args) /* allocation argument structure */
3094 xfs_agblock_t agsize; /* allocation group size */
3096 int flags; /* XFS_ALLOC_FLAG_... locking flags */
3097 struct xfs_mount *mp; /* mount structure pointer */
3098 xfs_agnumber_t sagno; /* starting allocation group number */
3099 xfs_alloctype_t type; /* input allocation type */
3101 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3104 type = args->otype = args->type;
3105 args->agbno = NULLAGBLOCK;
3107 * Just fix this up, for the case where the last a.g. is shorter
3108 * (or there's only one a.g.) and the caller couldn't easily figure
3109 * that out (xfs_bmap_alloc).
3111 agsize = mp->m_sb.sb_agblocks;
3112 if (args->maxlen > agsize)
3113 args->maxlen = agsize;
3114 if (args->alignment == 0)
3115 args->alignment = 1;
/* Debug asserts, then the same checks again as a production bailout. */
3116 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3117 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3118 ASSERT(args->minlen <= args->maxlen);
3119 ASSERT(args->minlen <= agsize);
3120 ASSERT(args->mod < args->prod);
3121 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3122 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3123 args->minlen > args->maxlen || args->minlen > agsize ||
3124 args->mod >= args->prod) {
3125 args->fsbno = NULLFSBLOCK;
3126 trace_xfs_alloc_vextent_badargs(args);
/* Dispatch on allocation type (switch statement header is elided). */
3131 case XFS_ALLOCTYPE_THIS_AG:
3132 case XFS_ALLOCTYPE_NEAR_BNO:
3133 case XFS_ALLOCTYPE_THIS_BNO:
3135 * These three force us into a single a.g.
3137 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3138 args->pag = xfs_perag_get(mp, args->agno);
3139 error = xfs_alloc_fix_freelist(args, 0);
3141 trace_xfs_alloc_vextent_nofix(args);
3145 trace_xfs_alloc_vextent_noagbp(args);
3148 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3149 if ((error = xfs_alloc_ag_vextent(args)))
3152 case XFS_ALLOCTYPE_START_BNO:
3154 * Try near allocation first, then anywhere-in-ag after
3155 * the first a.g. fails.
3157 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3158 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
/* inode32: pick the starting AG from the mount's rotor instead. */
3159 args->fsbno = XFS_AGB_TO_FSB(mp,
3160 ((mp->m_agfrotor / rotorstep) %
3161 mp->m_sb.sb_agcount), 0);
3164 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3165 args->type = XFS_ALLOCTYPE_NEAR_BNO;
/* Fall through to the rotor loop (shared with FIRST_AG). */
3167 case XFS_ALLOCTYPE_FIRST_AG:
3169 * Rotate through the allocation groups looking for a winner.
3171 if (type == XFS_ALLOCTYPE_FIRST_AG) {
3173 * Start with allocation group given by bno.
3175 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3176 args->type = XFS_ALLOCTYPE_THIS_AG;
3181 * Start with the given allocation group.
3183 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3184 flags = XFS_ALLOC_FLAG_TRYLOCK;
3187 * Loop over allocation groups twice; first time with
3188 * trylock set, second time without.
3191 args->pag = xfs_perag_get(mp, args->agno);
3192 error = xfs_alloc_fix_freelist(args, flags);
3194 trace_xfs_alloc_vextent_nofix(args);
3198 * If we get a buffer back then the allocation will fly.
3201 if ((error = xfs_alloc_ag_vextent(args)))
3206 trace_xfs_alloc_vextent_loopfailed(args);
3209 * Didn't work, figure out the next iteration.
3211 if (args->agno == sagno &&
3212 type == XFS_ALLOCTYPE_START_BNO)
3213 args->type = XFS_ALLOCTYPE_THIS_AG;
3215 * For the first allocation, we can try any AG to get
3216 * space. However, if we already have allocated a
3217 * block, we don't want to try AGs whose number is below
3218 * sagno. Otherwise, we may end up with out-of-order
3219 * locking of AGF, which might cause deadlock.
3221 if (++(args->agno) == mp->m_sb.sb_agcount) {
3222 if (args->tp->t_firstblock != NULLFSBLOCK)
3228 * Reached the starting a.g., must either be done
3229 * or switch to non-trylock mode.
3231 if (args->agno == sagno) {
3233 args->agbno = NULLAGBLOCK;
3234 trace_xfs_alloc_vextent_allfailed(args);
3239 if (type == XFS_ALLOCTYPE_START_BNO) {
3240 args->agbno = XFS_FSB_TO_AGBNO(mp,
3242 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3245 xfs_perag_put(args->pag);
/* Advance the mount-wide AG rotor for the next inode32 allocation. */
3248 if (args->agno == sagno)
3249 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3250 (mp->m_sb.sb_agcount * rotorstep);
3252 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3253 (mp->m_sb.sb_agcount * rotorstep);
/* Success/failure epilogue: translate (agno, agbno) back to an fsbno. */
3260 if (args->agbno == NULLAGBLOCK)
3261 args->fsbno = NULLFSBLOCK;
3263 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3265 ASSERT(args->len >= args->minlen);
3266 ASSERT(args->len <= args->maxlen);
3267 ASSERT(args->agbno % args->alignment == 0);
3268 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
/* Error exits: drop the perag reference taken above. */
3273 xfs_perag_put(args->pag);
3276 xfs_perag_put(args->pag);
3280 /* Ensure that the freelist is at full capacity. */
3282 xfs_free_extent_fix_freelist(
3283 struct xfs_trans *tp,
3284 xfs_agnumber_t agno,
3285 struct xfs_buf **agbp)
3287 struct xfs_alloc_arg args;
3290 memset(&args, 0, sizeof(struct xfs_alloc_arg))(
3292 args.mp = tp->t_mountp;
3296 * validate that the block number is legal - the enables us to detect
3297 * and handle a silent filesystem corruption rather than crashing.
3299 if (args.agno >= args.mp->m_sb.sb_agcount)
3300 return -EFSCORRUPTED;
3302 args.pag = xfs_perag_get(args.mp, args.agno);
/* FREEING: we are returning space, so shortfall must not fail the fixup. */
3305 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3311 xfs_perag_put(args.pag);
3317 * Just break up the extent address and hand off to xfs_free_ag_extent
3318 * after fixing up the freelist.
3322 struct xfs_trans *tp,
3325 const struct xfs_owner_info *oinfo,
3326 enum xfs_ag_resv_type type,
3329 struct xfs_mount *mp = tp->t_mountp;
3330 struct xfs_buf *agbp;
/* Split the filesystem block number into (AG number, AG block). */
3331 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
3332 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
3333 struct xfs_agf *agf;
3335 unsigned int busy_flags = 0;
3338 ASSERT(type != XFS_AG_RESV_AGFL);
/* Error-injection hook for testing free-extent failure paths. */
3340 if (XFS_TEST_ERROR(false, mp,
3341 XFS_ERRTAG_FREE_EXTENT))
3344 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
3349 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3350 error = -EFSCORRUPTED;
3354 /* validate the extent size is legal now we have the agf locked */
3355 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3356 error = -EFSCORRUPTED;
3360 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
/*
 * Mark the freed range busy so it isn't reused before the free commits;
 * the SKIP_DISCARD flag is presumably set conditionally on an elided
 * line — TODO confirm against full source.
 */
3365 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3366 xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
3370 xfs_trans_brelse(tp, agbp);
/* Carries the caller's callback (and, per name, private data) through
 * xfs_btree_query_range's void-pointer priv argument. */
3374 struct xfs_alloc_query_range_info {
3375 xfs_alloc_query_range_fn fn;
3379 /* Format btree record and pass to our callback. */
3381 xfs_alloc_query_range_helper(
3382 struct xfs_btree_cur *cur,
3383 union xfs_btree_rec *rec,
3386 struct xfs_alloc_query_range_info *query = priv;
3387 struct xfs_alloc_rec_incore irec;
/* Convert the on-disk big-endian record to the in-core form. */
3389 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3390 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3391 return query->fn(cur, &irec, query->priv);
3394 /* Find all free space within a given range of blocks. */
3396 xfs_alloc_query_range(
3397 struct xfs_btree_cur *cur,
3398 struct xfs_alloc_rec_incore *low_rec,
3399 struct xfs_alloc_rec_incore *high_rec,
3400 xfs_alloc_query_range_fn fn,
/* Only the by-block (bnobt) cursor orders records by start block. */
3403 union xfs_btree_irec low_brec;
3404 union xfs_btree_irec high_brec;
3405 struct xfs_alloc_query_range_info query;
3407 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3408 low_brec.a = *low_rec;
3409 high_brec.a = *high_rec;
/* Wrap fn/priv in query and delegate to the generic btree range walk. */
3412 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3413 xfs_alloc_query_range_helper, &query);
3416 /* Find all free space records. */
3418 xfs_alloc_query_all(
3419 struct xfs_btree_cur *cur,
3420 xfs_alloc_query_range_fn fn,
3423 struct xfs_alloc_query_range_info query;
/* Same wrapper as xfs_alloc_query_range, but walks every record. */
3425 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3428 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3431 /* Is there a record covering a given extent? */
3433 xfs_alloc_has_record(
3434 struct xfs_btree_cur *cur,
3439 union xfs_btree_irec low;
3440 union xfs_btree_irec high;
/*
 * Build a [bno, bno + len - 1] key range: zero the low key except for the
 * start block, and saturate the high key (0xFF fill) so only its start
 * block constrains the search.
 */
3442 memset(&low, 0, sizeof(low));
3443 low.a.ar_startblock = bno;
3444 memset(&high, 0xFF, sizeof(high));
3445 high.a.ar_startblock = bno + len - 1;
3447 return xfs_btree_has_record(cur, &low, &high, exists);
3451 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
3452 * error code or XFS_ITER_*.
3456 struct xfs_mount *mp,
3457 struct xfs_agf *agf,
3458 struct xfs_buf *agflbp,
3459 xfs_agfl_walk_fn walk_fn,
3466 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3467 i = be32_to_cpu(agf->agf_flfirst);
3469 /* Nothing to walk in an empty AGFL. */
3470 if (agf->agf_flcount == cpu_to_be32(0))
3473 /* Otherwise, walk from first to last, wrapping as needed. */
3475 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3478 if (i == be32_to_cpu(agf->agf_fllast))
3480 if (++i == xfs_agfl_size(mp))