2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
30 #include "trace_gfs2.h"
32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
37 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
38 __u16 mp_list[GFS2_MAX_META_HEIGHT];
/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode being unstuffed
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The (optional) page. This is looked up if @page is NULL
 *
 * Copies the data held inline in the dinode into a page backed by the
 * newly allocated data block.
 *
 * NOTE(review): several lines of this function (braces, error paths,
 * page unlock/release) are elided in this excerpt; comments describe
 * only the visible statements.
 */
static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
	struct inode *inode = &ip->i_inode;
	struct buffer_head *bh;
	/* Look the page up if the caller did not supply page 0 directly. */
	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);
		/* Stuffed data cannot exceed the dinode block minus its header. */
		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
		/* Copy the inline data into the page and zero the remainder. */
		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		SetPageUptodate(page);
	if (!page_has_buffers(page))
		create_empty_buffers(page, BIT(inode->i_blkbits),
	bh = page_buffers(page);
	/* Map the page's buffer to the newly allocated disk block. */
	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);
	set_buffer_uptodate(bh);
	/* Ordinary data is dirtied here; jdata goes through the journal. */
	if (!gfs2_is_jdata(ip))
		mark_buffer_dirty(bh);
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 * @page: The (optional) page. This is looked up if the @page is NULL
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
 *
 * Returns: errno
 *
 * NOTE(review): error-handling, declarations and cleanup lines are elided
 * in this excerpt; comments describe only the visible statements.
 */
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	int isdir = gfs2_is_dir(ip);
	down_write(&ip->i_rw_mutex);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	/* Only move data out if the file is non-empty. */
	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
		error = gfs2_dir_get_new_buffer(ip, block, &bh);
		/* Directories keep a metadata header; copy the tail after it. */
		gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
				      dibh, sizeof(struct gfs2_dinode));
		error = gfs2_unstuffer_page(ip, dibh, block, page);
	/* Set up the pointer to the new block */
	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
	if (i_size_read(&ip->i_inode)) {
		/* First (and only) pointer in the dinode points at the new block. */
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	/* The inode is now a height-1 metadata tree. */
	di->di_height = cpu_to_be16(1);
	up_write(&ip->i_rw_mutex);
174 * find_metapath - Find path through the metadata tree
175 * @sdp: The superblock
176 * @mp: The metapath to return the result in
177 * @block: The disk block to look up
178 * @height: The pre-calculated height of the metadata tree
180 * This routine returns a struct metapath structure that defines a path
181 * through the metadata of inode "ip" to get to block "block".
184 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
185 * filesystem with a blocksize of 4096.
187 * find_metapath() would return a struct metapath structure set to:
188 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
189 * and mp_list[2] = 165.
191 * That means that in order to get to the block containing the byte at
192 * offset 101342453, we would load the indirect block pointed to by pointer
193 * 0 in the dinode. We would then load the indirect block pointed to by
194 * pointer 48 in that indirect block. We would then load the data block
195 * pointed to by pointer 165 in that indirect block.
197 * ----------------------------------------
202 * ----------------------------------------
206 * ----------------------------------------
210 * |0 5 6 7 8 9 0 1 2|
211 * ----------------------------------------
215 * ----------------------------------------
220 * ----------------------------------------
224 * ----------------------------------------
225 * | Data block containing offset |
229 * ----------------------------------------
233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
234 struct metapath *mp, unsigned int height)
238 for (i = height; i--;)
239 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
245 if (mp->mp_list[0] == 0)
251 * metaptr1 - Return the first possible metadata pointer in a metaath buffer
252 * @height: The metadata height (0 = dinode)
255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
257 struct buffer_head *bh = mp->mp_bh[height];
259 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 * metapointer - Return pointer to start of metadata in a buffer
265 * @height: The metadata height (0 = dinode)
268 * Return a pointer to the block number of the next height of the metadata
269 * tree given a buffer containing the pointer to the current height of the
273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
275 __be64 *p = metaptr1(height, mp);
276 return p + mp->mp_list[height];
/* Issue opportunistic read-ahead for the block pointers in a metadata
 * buffer, starting at @pos.
 *
 * NOTE(review): several lines of this function are elided in this excerpt
 * (zero-pointer skip, the !uptodate/brelse paths and closing braces);
 * comments cover only the visible statements. */
static void gfs2_metapath_ra(struct gfs2_glock *gl,
			     const struct buffer_head *bh, const __be64 *pos)
	struct buffer_head *rabh;
	/* One past the last pointer slot in this buffer. */
	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
	for (t = pos; t < endp; t++) {
		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		/* Only start read-ahead if nobody else holds the buffer lock. */
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
305 * lookup_mp_height - helper function for lookup_metapath
308 * @h: the height which needs looking up
310 static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
312 __be64 *ptr = metapointer(h, mp);
313 u64 dblock = be64_to_cpu(*ptr);
318 return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
322 * lookup_metapath - Walk the metadata tree to a specific point
326 * Assumes that the inode's buffer has already been looked up and
327 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
328 * by find_metapath().
330 * If this function encounters part of the tree which has not been
331 * allocated, it returns the current height of the tree at the point
332 * at which it found the unallocated block. Blocks which are found are
333 * added to the mp->mp_bh[] list.
335 * Returns: error or height of metadata tree
338 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
340 unsigned int end_of_metadata = ip->i_height - 1;
344 for (x = 0; x < end_of_metadata; x++) {
345 ret = lookup_mp_height(ip, mp, x);
354 * fillup_metapath - fill up buffers for the metadata path to a specific height
357 * @h: The height to which it should be mapped
359 * Similar to lookup_metapath, but does lookups for a range of heights
361 * Returns: error or height of metadata tree
364 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
366 unsigned int start_h = h - 1;
370 /* find the first buffer we need to look up. */
371 while (start_h > 0 && mp->mp_bh[start_h] == NULL)
373 for (; start_h < h; start_h++) {
374 ret = lookup_mp_height(ip, mp, start_h);
382 static inline void release_metapath(struct metapath *mp)
386 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
387 if (mp->mp_bh[i] == NULL)
389 brelse(mp->mp_bh[i]);
394 * gfs2_extent_length - Returns length of an extent of blocks
395 * @start: Start of the buffer
396 * @len: Length of the buffer in bytes
397 * @ptr: Current position in the buffer
398 * @limit: Max extent length to return (0 = unlimited)
399 * @eob: Set to 1 if we hit "end of block"
401 * If the first block is zero (unallocated) it will return the number of
402 * unallocated blocks in the extent, otherwise it will return the number
403 * of contiguous blocks in the extent.
405 * Returns: The length of the extent (minimum of one block)
408 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
410 const __be64 *end = (start + len);
411 const __be64 *first = ptr;
412 u64 d = be64_to_cpu(*ptr);
419 if (limit && --limit == 0)
423 } while(be64_to_cpu(*ptr) == d);
426 return (ptr - first);
429 static inline void bmap_lock(struct gfs2_inode *ip, int create)
432 down_write(&ip->i_rw_mutex);
434 down_read(&ip->i_rw_mutex);
437 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
440 up_write(&ip->i_rw_mutex);
442 up_read(&ip->i_rw_mutex);
445 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
446 struct gfs2_glock *gl, unsigned int i,
447 unsigned offset, u64 bn)
449 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
450 ((i > 1) ? sizeof(struct gfs2_meta_header) :
451 sizeof(struct gfs2_dinode)));
453 BUG_ON(mp->mp_bh[i] != NULL);
454 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
455 gfs2_trans_add_meta(gl, mp->mp_bh[i]);
456 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
457 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
459 *ptr = cpu_to_be64(bn);
/* States for the block-allocation state machine in gfs2_bmap_alloc(). */
enum alloc_state {
	ALLOC_DATA = 0,		/* Tree complete; allocating data blocks */
	ALLOC_GROW_DEPTH = 1,	/* Filling in the lower part of the tree */
	ALLOC_GROW_HEIGHT = 2,	/* Growing the height of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
470 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
473 return sdp->sd_inptrs;
474 return sdp->sd_diptrs;
/**
 * gfs2_bmap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @lblock: The logical starting block of the extent
 * @bh_map: This is used to return the mapping details
 * @mp: The metapath
 * @sheight: The starting height (i.e. whats already mapped)
 * @height: The height to build to
 * @maxlen: The max number of data blocks to alloc
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per bmap call)
 * and uses the state machine to initialise the blocks in order.
 *
 * Returns: errno on error
 *
 * NOTE(review): many lines of this function (the final parameter line,
 * declarations, the do/switch structure, labels, error checks and
 * closing braces) are elided in this excerpt; comments apply only to
 * the visible statements.
 */
static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
			   struct buffer_head *bh_map, struct metapath *mp,
			   const unsigned int sheight,
			   const unsigned int height,
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned ptrs_per_blk;
	const unsigned end_of_metadata = height - 1;
	enum alloc_state state;
	/* The caller must have looked up the dinode buffer already. */
	BUG_ON(dibh == NULL);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	if (height == sheight) {
		struct buffer_head *bh;
		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
		/* Need to allocate indirect blocks */
		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
			    mp->mp_list[end_of_metadata]));
		if (height == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = height - sheight;
			state = ALLOC_GROW_DEPTH;
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = height - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (height - branch_start);
	/* start of the second part of the function (state machine) */
	blks = dblks + iblks;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		/* Metadata (and jdata) blocks need revokes cancelled. */
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			ptr = (__be64 *)(dibh->b_data +
					 sizeof(struct gfs2_dinode));
			for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == height - ip->i_height) {
				/* Move the dinode's old pointers down into the
				   new top indirect block. */
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
						 sizeof(struct gfs2_meta_header));
				state = ALLOC_GROW_DEPTH;
				for(i = branch_start; i < height; i++) {
					if (mp->mp_bh[i] == NULL)
					brelse(mp->mp_bh[i]);
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < height)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
		/* Tree complete, adding data blocks */
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			ptr = metapointer(end_of_metadata, mp);
				*ptr++ = cpu_to_be64(bn++);
			if (buffer_zeronew(bh_map)) {
				/* Caller asked for zeroed blocks (e.g. fallocate). */
				ret = sb_issue_zeroout(sb, dblock, dblks,
					"Failed to zero data buffers\n");
				clear_buffer_zeronew(bh_map);
	} while ((state != ALLOC_DATA) || !dblock);
	/* Commit the new tree shape back into the dinode. */
	ip->i_height = height;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	map_bh(bh_map, inode->i_sb, dblock);
	bh_map->b_size = dblks << inode->i_blkbits;
	set_buffer_new(bh_map);
/**
 * gfs2_block_map - Map a block from an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if its ok to alloc blocks to satify the request
 *
 * Sets buffer_mapped() if successful, sets buffer_boundary() if a
 * read of metadata will be required before the next block can be
 * mapped. Sets buffer_new() if new blocks were allocated.
 *
 * NOTE(review): declarations, labels and several branches/braces are
 * elided in this excerpt; comments apply only to the visible statements.
 */
int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int bsize = sdp->sd_sb.sb_bsize;
	/* How many blocks the caller would like mapped in one go. */
	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
	const u64 *arr = sdp->sd_heightsize;
	struct buffer_head *bh;
	memset(&mp, 0, sizeof(mp));
	bmap_lock(ip, create);
	/* Start from a clean mapping state. */
	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
	/* Directories use the journaled block size / height table. */
	if (gfs2_is_dir(ip)) {
		bsize = sdp->sd_jbsize;
		arr = sdp->sd_jheightsize;
	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
	/* Work out the tree height needed to address this offset. */
	height = ip->i_height;
	size = (lblock + 1) * bsize;
	while (size > arr[height])
	find_metapath(sdp, lblock, &mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
	ret = lookup_metapath(ip, &mp);
	if (ret != ip->i_height)
	ptr = metapointer(ip->i_height - 1, &mp);
	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
	bh = mp.mp_bh[ip->i_height - 1];
	/* Extend the mapping over the contiguous extent that follows. */
	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
	bh_map->b_size = (len << inode->i_blkbits);
	set_buffer_boundary(bh_map);
	release_metapath(&mp);
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	bmap_unlock(ip, create);
/* All allocations are done here, firstly check create flag */
	BUG_ON(gfs2_is_stuffed(ip));
/* At this point ret is the tree depth of already allocated blocks */
	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
/*
 * Deprecated: do not use in new code
 *
 * Maps @lblock and reports the disk block and extent length in output
 * parameters. NOTE(review): locals, the derivation of "create" and the
 * early-exit checks are elided in this excerpt.
 */
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
	/* For lookups, request a 32-block extent in a single bmap call. */
	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
	ret = gfs2_block_map(inode, lblock, &bh, create);
	*extlen = bh.b_size >> inode->i_blkbits;
	*dblock = bh.b_blocknr;
/**
 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
 * @mapping: the page cache of the inode being truncated
 * @from: the new file size; the partial block containing it is zeroed
 *
 * This is partly borrowed from ext3.
 *
 * NOTE(review): locals, error labels, page unlock/put and several braces
 * are elided in this excerpt; comments apply only to visible statements.
 */
static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
	struct inode *inode = mapping->host;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned long index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	unsigned blocksize, iblock, length, pos;
	struct buffer_head *bh;
	page = find_or_create_page(mapping, index, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;
	/* Bytes to zero: from "offset" to the end of its block. */
	length = blocksize - (offset & (blocksize - 1));
	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	while (offset >= pos) {
		bh = bh->b_this_page;
	if (!buffer_mapped(bh)) {
		gfs2_block_map(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);
	if (!buffer_uptodate(bh)) {
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
	/* Zero the tail of the block and dirty the buffer. */
	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
/* Max revokes per truncate transaction chunk for journaled-data files. */
#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
 *
 * NOTE(review): the loop's chunk clamping, error checks and transaction
 * end are elided in this excerpt.
 */
static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	/* Largest byte range whose revokes fit in one transaction. */
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	while (oldsize != newsize) {
		chunk = oldsize - newsize;
		if (chunk > max_chunk)
		/* Truncate one chunk of page cache at a time. */
		truncate_pagecache(inode, oldsize - chunk);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/* trunc_start - first phase of a shrink: update i_size, zero the partial
 * tail block and truncate the page cache.
 *
 * NOTE(review): error labels, the if/else structure joining the two
 * gfs2_trans_begin() calls and several braces are elided in this excerpt;
 * comments apply only to the visible statements. */
static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct address_space *mapping = inode->i_mapping;
	struct buffer_head *dibh;
	int journaled = gfs2_is_jdata(ip);
	/* Journaled data needs revoke room in the transaction... */
	error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
	/* ...ordinary data only needs the dinode. */
	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	if (gfs2_is_stuffed(ip)) {
		/* Inline data: just clear everything past the new size. */
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
			/* Zero the tail of the partial final block. */
			error = gfs2_block_truncate_page(mapping, newsize);
		/* Mark the truncate in progress for crash recovery. */
		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
	i_size_write(inode, newsize);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_dinode_out(ip, dibh->b_data);
	error = gfs2_journaled_truncate(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: the inode
 * @rd_gh: holder of resource group glock
 * @mp: current metapath fully populated with buffers
 * @btotal: place to keep count of total blocks freed
 * @hgt: height we're processing
 * @first: true if this is the first call to this function for this height
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 *
 * NOTE(review): many lines of this function (labels, the retry loop, some
 * branches and closing braces) are elided in this excerpt; comments apply
 * only to the visible statements.
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      const struct metapath *mp, u32 *btotal, int hgt,
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	struct buffer_head *bh = mp->mp_bh[hgt];
	__be64 *top, *bottom, *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	/* Anything above the data height counts as metadata when freeing. */
	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
	bool buf_in_tr = false; /* buffer was added to transaction */

	/* Sanity check: the buffer must be an indirect block, or the dinode
	   itself when sweeping height 0. */
	if (gfs2_metatype_check(sdp, bh,
				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
	blks_outside_rgrp = 0;
	top = metapointer(hgt, mp); /* first ptr from metapath */
	/* If we're keeping some data at the truncation point, we've got to
	   preserve the metadata tree by adding 1 to the starting metapath. */
	bottom = (__be64 *)(bh->b_data + bh->b_size);

	for (p = top; p < bottom; p++) {
		bn = be64_to_cpu(*p);
		if (gfs2_holder_initialized(rd_gh)) {
			rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
			gfs2_assert_withdraw(sdp,
				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
			/* No rgrp held yet: look up and lock the one for @bn. */
			rgd = gfs2_blk2rgrpd(sdp, bn, false);
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		/* Defer blocks belonging to a different rgrp to a later pass. */
		if (!rgrp_contains_block(rgd, bn)) {
		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
					atomic_read(&sdp->sd_log_thresh2);
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
				revokes += hptrs(sdp, hgt);
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			down_write(&ip->i_rw_mutex);
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */
			gfs2_trans_add_meta(ip->i_gl, bh);
		/* Coalesce runs of adjacent block numbers into one free call. */
		if (bstart + blen == bn) {
			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		/* Free any final pending run of blocks. */
		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh = mp->mp_bh[0];

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
		gfs2_glock_dq_uninit(rd_gh);
/**
 * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * assumes the metapath is valid (with buffers) out to height h
 * @sdp: the superblock
 * @mp: starting metapath
 * @h: desired height to search
 *
 * Returns: true if a non-null pointer was found in the metapath buffer
 *          false if all remaining pointers are NULL in the buffer
 *
 * NOTE(review): the surrounding search loop, the true-return path and
 * declarations are elided in this excerpt; comments apply only to the
 * visible statements.
 */
static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
	/* Index of the last valid pointer slot at this height. */
	unsigned int ptrs = hptrs(sdp, h) - 1;
		ptr = metapointer(h, mp);
		if (*ptr) /* if we have a non-null pointer */
		/* Otherwise advance to the next slot while any remain. */
		if (mp->mp_list[h] < ptrs)
	return false; /* no more pointers in this buffer */
/* States of the bottom-up truncate/delete state machine in trunc_dealloc(). */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,  /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,     /* process complete */
};
/**
 * trunc_dealloc - truncate a file down to a desired size
 * @ip: inode to truncate
 * @newsize: The desired size of the file
 *
 * This function truncates a file to newsize. It works from the
 * bottom up, and from the right to the left. In other words, it strips off
 * the highest layer (data) before stripping any of the metadata. Doing it
 * this way is best in case the operation is interrupted by power failure, etc.
 * The dinode is rewritten in every transaction to guarantee integrity.
 *
 * NOTE(review): declarations, labels, the switch structure, error checks
 * and many braces are elided in this excerpt; comments apply only to the
 * visible statements.
 */
static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
	unsigned int strip_h = ip->i_height - 1;
	int mp_h; /* metapath buffers are read in to this height */
	sector_t last_ra = 0;
	bool preserve1; /* need to preserve the first meta pointer? */

	/* Logical block of the last byte we keep. */
	lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
	memset(&mp, 0, sizeof(mp));
	find_metapath(sdp, lblock, &mp, ip->i_height);
	/* Remember the truncation point in the metapath. */
	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
	ret = gfs2_meta_inode_buffer(ip, &dibh);
	ret = lookup_metapath(ip, &mp);
	if (ret == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
		state = DEALLOC_FILL_MP; /* deal with partial metapath */
	ret = gfs2_rindex_update(sdp);
	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	gfs2_holder_mark_uninitialized(&rd_gh);

	while (state != DEALLOC_DONE) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			if (mp_h > 0) { /* issue read-ahead on metadata */
				bh = mp.mp_bh[mp_h - 1];
				/* Avoid re-issuing read-ahead for the same parent. */
				if (bh->b_blocknr != last_ra) {
					last_ra = bh->b_blocknr;
					top = metaptr1(mp_h - 1, &mp);
					gfs2_metapath_ra(ip->i_gl, bh, top);
			/* If we're truncating to a non-zero size and the mp is
			   at the beginning of file for the strip height, we
			   need to preserve the first metadata pointer. */
			preserve1 = (newsize &&
				     (mp.mp_list[mp_h] == nbof[mp_h]));
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			prev_bnr = bh->b_blocknr;
			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
			/* If we hit an error or just swept dinode buffer,
				state = DEALLOC_DONE;
			state = DEALLOC_MP_LOWER;
		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
				state = DEALLOC_FILL_MP;
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
				break; /* loop around in the same state */
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
				state = DEALLOC_FILL_MP;
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */
		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			/* If buffers found for the entire strip height */
			if ((ret == ip->i_height) && (mp_h == strip_h)) {
				state = DEALLOC_MP_FULL;
			if (ret < ip->i_height) /* We have a partial height */
			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h))
				state = DEALLOC_MP_LOWER;

	if (current->journal_info == NULL) {
		ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
		down_write(&ip->i_rw_mutex);
	/* Credit the freed blocks back to statfs and the user's quota. */
	gfs2_statfs_change(sdp, 0, +btotal, 0);
	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	gfs2_quota_unhold(ip);
	release_metapath(&mp);
/* trunc_end - finish a truncate: clear TRUNC_IN_PROG, update timestamps
 * and write the dinode back.
 *
 * NOTE(review): error labels and several braces are elided in this
 * excerpt; comments apply only to the visible statements. */
static int trunc_end(struct gfs2_inode *ip)
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	down_write(&ip->i_rw_mutex);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!i_size_read(&ip->i_inode)) {
		/* File is now empty: reset the allocation goal and clear
		   any remaining inline tail. */
		ip->i_goal = ip->i_no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
		gfs2_ordered_del_inode(ip);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	/* The truncate is complete; clear the in-progress flag. */
	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
/**
 * do_shrink - make a file smaller
 * @inode: the inode
 * @oldsize: the current inode size
 * @newsize: the size to make the file
 *
 * Called with an exclusive lock on @inode. The @size must
 * be equal to or smaller than the current inode size.
 *
 * NOTE(review): the error checks between steps are elided in this excerpt.
 */
static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_inode *ip = GFS2_I(inode);
	error = trunc_start(inode, oldsize, newsize);
	/* A stuffed file has no allocated blocks to deallocate. */
	if (gfs2_is_stuffed(ip))
	error = trunc_dealloc(ip, newsize);
	error = trunc_end(ip);
/*
 * gfs2_trim_blocks - trim blocks allocated beyond the current file size
 * @inode: the inode to trim
 *
 * Runs do_shrink() with newsize == oldsize, i.e. a "truncate to the same
 * size".  Since do_shrink()'s dealloc phase frees blocks beyond @newsize,
 * this releases any allocation past EOF without changing the file size.
 *
 * NOTE(review): the `int ret;` declaration and whatever is done with @ret
 * afterwards (presumably a WARN on failure — confirm against the full
 * file) are elided in this excerpt.
 */
1382 void gfs2_trim_blocks(struct inode *inode)
1384 u64 size = inode->i_size;
1387 ret = do_shrink(inode, size, size);
1392  * do_grow - Touch and update inode size
 * @inode: the inode being grown
1394  * @size: The new size
1396  * This function updates the timestamps on the inode and
1397  * may also increase the size of the inode. This function
1398  * must not be called with @size any smaller than the current
 *
1401  * Although it is not strictly required to unstuff files here,
1402  * earlier versions of GFS2 have a bug in the stuffed file reading
1403  * code which will result in a buffer overrun if the size is larger
1404  * than the max stuffed file size. In order to prevent this from
1405  * occurring, such files are unstuffed, but in other cases we can
1406  * just update the inode size directly.
 *
 * NOTE(review): in this excerpt the quota/reserve error checks, the
 * `unstuff` flag that gates the unstuff/release/qunlock paths, the
 * brelse() of @dibh and the final return are elided.
 *
1408  * Returns: 0 on success, or -ve on error
 */
1411 static int do_grow(struct inode *inode, u64 size)
1413 struct gfs2_inode *ip = GFS2_I(inode);
1414 struct gfs2_sbd *sdp = GFS2_SB(inode);
1415 struct gfs2_alloc_parms ap = { .target = 1, };
1416 struct buffer_head *dibh;
/*
 * A stuffed inode growing past what fits inside the dinode block must
 * be unstuffed, which needs a block allocation: take quota and
 * resource-group reservations first.
 */
1420 if (gfs2_is_stuffed(ip) &&
1421 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1422 error = gfs2_quota_lock_check(ip, &ap);
1426 error = gfs2_inplace_reserve(ip, &ap);
1428 goto do_grow_qunlock;
/* Reserve extra journal space for the quota change unless quotas are off */
1432 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1433 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1436 goto do_grow_release;
1439 error = gfs2_unstuff_dinode(ip, NULL);
1444 error = gfs2_meta_inode_buffer(ip, &dibh);
/* Commit the new size and timestamps into the on-disk dinode */
1448 i_size_write(inode, size);
1449 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1450 gfs2_trans_add_meta(ip->i_gl, dibh);
1451 gfs2_dinode_out(ip, dibh->b_data);
1455 gfs2_trans_end(sdp);
/* Unwind the reservations taken for the unstuff case */
1458 gfs2_inplace_release(ip);
1460 gfs2_quota_unlock(ip);
1466  * gfs2_setattr_size - make a file a given size
 * @inode: the inode
1468  * @newsize: the size to make the file
1470  * The file size can grow, shrink, or stay the same size. This
1471  * is called holding i_mutex and an exclusive glock on the inode
 *
 * Dispatches to do_grow() when @newsize >= the current size and
 * do_shrink() otherwise, after validating the new size, waiting for
 * in-flight direct I/O, and setting up the per-inode block/quota
 * reservation structures (torn down again at the end).
 *
 * NOTE(review): early-return error checks after inode_newsize_ok() and
 * gfs2_rsqa_alloc(), the `goto out` on the grow path, the `out:` label
 * and the final return are elided in this excerpt.
 *
 * Returns: errno
 */
1477 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1479 struct gfs2_inode *ip = GFS2_I(inode);
/* Only regular files may be resized through this path */
1483 BUG_ON(!S_ISREG(inode->i_mode));
1485 ret = inode_newsize_ok(inode, newsize);
1489 inode_dio_wait(inode);
1491 ret = gfs2_rsqa_alloc(ip);
1495 oldsize = inode->i_size;
1496 if (newsize >= oldsize) {
1497 ret = do_grow(inode, newsize);
1501 ret = do_shrink(inode, oldsize, newsize);
1503 gfs2_rsqa_delete(ip, NULL);
/*
 * gfs2_truncatei_resume - resume an interrupted truncate
 * @ip: the inode whose truncate is being resumed
 *
 * Re-runs the deallocation phase down to the current on-disk size and
 * then finishes via trunc_end().  Presumably invoked when a dinode is
 * found with TRUNC_IN_PROG still set (e.g. after a crash) — confirm
 * against the callers.
 *
 * NOTE(review): the `int error;` declaration, the `if (!error)` guard
 * before trunc_end() and the final return are elided in this excerpt.
 *
 * Returns: errno
 */
1507 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1510 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1512 error = trunc_end(ip);
/*
 * gfs2_file_dealloc - deallocate all of a file's blocks
 * @ip: the inode to deallocate
 *
 * Thin wrapper: truncating the metadata tree to size 0 frees every
 * data and indirect block belonging to the file.
 *
 * Returns: errno (whatever trunc_dealloc() returns)
 */
1516 int gfs2_file_dealloc(struct gfs2_inode *ip)
1518 return trunc_dealloc(ip, 0);
1522  * gfs2_free_journal_extents - Free cached journal bmap info
 * @jd: The journal descriptor whose extent cache is released
 *
 * Pops every gfs2_journal_extent off jd->extent_list until the list
 * is empty.
 *
 * NOTE(review): the kfree(jext) that should follow list_del() is elided
 * in this excerpt — confirm it is present in the full file, otherwise
 * this would leak each extent.
 */
1527 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1529 struct gfs2_journal_extent *jext;
1531 while(!list_empty(&jd->extent_list)) {
1532 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1533 list_del(&jext->list);
1539  * gfs2_add_jextent - Add or merge a new extent to extent cache
1540  * @jd: The journal descriptor
1541  * @lblock: The logical block at start of new extent
1542  * @dblock: The physical block at start of new extent
1543  * @blocks: Size of extent in fs blocks
 *
 * If the new range is physically contiguous with the last cached extent
 * (tail of jd->extent_list), it is merged by extending that extent;
 * otherwise a new extent record is allocated and appended.
 *
1545  * Returns: 0 on success or -ENOMEM
 *
 * NOTE(review): the `return 0;` after the merge, the NULL check after
 * kzalloc() and the final return are elided in this excerpt.
 */
1548 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1550 struct gfs2_journal_extent *jext;
1552 if (!list_empty(&jd->extent_list)) {
/* Try to merge with the most recently added extent */
1553 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1554 if ((jext->dblock + jext->blocks) == dblock) {
1555 jext->blocks += blocks;
/* Not contiguous: record a brand-new extent at the tail */
1560 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1563 jext->dblock = dblock;
1564 jext->lblock = lblock;
1565 jext->blocks = blocks;
1566 list_add_tail(&jext->list, &jd->extent_list);
1572  * gfs2_map_journal_extents - Cache journal bmap info
1573  * @sdp: The super block
1574  * @jd: The journal to map
 *
1576  * Create a reusable "extent" mapping from all logical
1577  * blocks to all physical blocks for the given journal. This will save
1578  * us time when writing journal blocks. Most journals will have only one
1579  * extent that maps all their logical blocks. That's because gfs2.mkfs
1580  * arranges the journal blocks sequentially to maximize performance.
1581  * So the extent would map the first block for the entire file length.
1582  * However, gfs2_jadd can happen while file activity is happening, so
1583  * those journals may not be sequential. Less likely is the case where
1584  * the users created their own journals by mounting the metafs and
1585  * laying it out. But it's still possible. These journals might have
 * several extents.
 *
1588  * Returns: 0 on success, or error on failure
 *
 * NOTE(review): several lines are elided in this excerpt — among them the
 * declarations of @lblock/@lblock_stop/@size/@rc, the loop header that
 * drives gfs2_block_map() over the whole journal, the `size -= bh.b_size`
 * style bookkeeping implied by the error message at the bottom, and both
 * return statements.  Confirm the loop structure against the full file.
 */
1591 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1595 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
/* On-stack bh used purely as an output parameter for gfs2_block_map() */
1596 struct buffer_head bh;
1597 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1601 lblock_stop = i_size_read(jd->jd_inode) >> shift;
1602 size = (lblock_stop - lblock) << shift;
/* The cache must be built from scratch — it should be empty here */
1604 WARN_ON(!list_empty(&jd->extent_list));
1610 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
/* A hole or a bmap error in a journal is fatal for the mapping */
1611 if (rc || !buffer_mapped(&bh))
/* bh.b_size covers a whole contiguous run, not just one block */
1613 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1617 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1620 fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
/* Error path: report how far we got, then discard the partial cache */
1625 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1627 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1629 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1630 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1631 bh.b_state, (unsigned long long)bh.b_size);
1632 gfs2_free_journal_extents(jd);
1637 * gfs2_write_alloc_required - figure out if a write will require an allocation
1638 * @ip: the file being written to
1639 * @offset: the offset to write to
1640 * @len: the number of bytes being written
1642 * Returns: 1 if an alloc is required, 0 otherwise
1645 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1648 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1649 struct buffer_head bh;
1651 u64 lblock, lblock_stop, size;
1657 if (gfs2_is_stuffed(ip)) {
1659 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1664 shift = sdp->sd_sb.sb_bsize_shift;
1665 BUG_ON(gfs2_is_dir(ip));
1666 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1667 lblock = offset >> shift;
1668 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1669 if (lblock_stop > end_of_file)
1672 size = (lblock_stop - lblock) << shift;
1676 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1677 if (!buffer_mapped(&bh))
1680 lblock += (bh.b_size >> ip->i_inode.i_blkbits);