2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
30 #include "trace_gfs2.h"
32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
37 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
38 __u16 mp_list[GFS2_MAX_META_HEIGHT];
/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode being unstuffed
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The (optional) page. This is looked up if @page is NULL
 *
 * Copies the data held inline in the dinode into a page backed by the
 * newly allocated data block.
 *
 * NOTE(review): several lines of this function (braces, error paths,
 * page unlock/release) are elided in this excerpt; comments describe
 * only the visible statements.
 */
static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
	struct inode *inode = &ip->i_inode;
	struct buffer_head *bh;
	/* Look the page up if the caller did not supply page 0 directly. */
	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);
		/* Stuffed data cannot exceed the dinode block minus its header. */
		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
		/* Copy the inline data into the page and zero the remainder. */
		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		SetPageUptodate(page);
	if (!page_has_buffers(page))
		create_empty_buffers(page, BIT(inode->i_blkbits),
	bh = page_buffers(page);
	/* Map the page's buffer to the newly allocated disk block. */
	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);
	set_buffer_uptodate(bh);
	/* Ordinary data is dirtied here; jdata goes through the journal. */
	if (!gfs2_is_jdata(ip))
		mark_buffer_dirty(bh);
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 * @page: The (optional) page. This is looked up if the @page is NULL
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
 *
 * Returns: errno
 *
 * NOTE(review): error-handling, declarations and cleanup lines are elided
 * in this excerpt; comments describe only the visible statements.
 */
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	int isdir = gfs2_is_dir(ip);
	down_write(&ip->i_rw_mutex);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	/* Only move data out if the file is non-empty. */
	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
		error = gfs2_dir_get_new_buffer(ip, block, &bh);
		/* Directories keep a metadata header; copy the tail after it. */
		gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
				      dibh, sizeof(struct gfs2_dinode));
		error = gfs2_unstuffer_page(ip, dibh, block, page);
	/* Set up the pointer to the new block */
	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
	if (i_size_read(&ip->i_inode)) {
		/* First (and only) pointer in the dinode points at the new block. */
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	/* The inode is now a height-1 metadata tree. */
	di->di_height = cpu_to_be16(1);
	up_write(&ip->i_rw_mutex);
174 * find_metapath - Find path through the metadata tree
175 * @sdp: The superblock
176 * @mp: The metapath to return the result in
177 * @block: The disk block to look up
178 * @height: The pre-calculated height of the metadata tree
180 * This routine returns a struct metapath structure that defines a path
181 * through the metadata of inode "ip" to get to block "block".
184 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
185 * filesystem with a blocksize of 4096.
187 * find_metapath() would return a struct metapath structure set to:
188 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
189 * and mp_list[2] = 165.
191 * That means that in order to get to the block containing the byte at
192 * offset 101342453, we would load the indirect block pointed to by pointer
193 * 0 in the dinode. We would then load the indirect block pointed to by
194 * pointer 48 in that indirect block. We would then load the data block
195 * pointed to by pointer 165 in that indirect block.
197 * ----------------------------------------
202 * ----------------------------------------
206 * ----------------------------------------
210 * |0 5 6 7 8 9 0 1 2|
211 * ----------------------------------------
215 * ----------------------------------------
220 * ----------------------------------------
224 * ----------------------------------------
225 * | Data block containing offset |
229 * ----------------------------------------
233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
234 struct metapath *mp, unsigned int height)
238 for (i = height; i--;)
239 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
245 if (mp->mp_list[0] == 0)
251 * metaptr1 - Return the first possible metadata pointer in a metaath buffer
252 * @height: The metadata height (0 = dinode)
255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
257 struct buffer_head *bh = mp->mp_bh[height];
259 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 * metapointer - Return pointer to start of metadata in a buffer
265 * @height: The metadata height (0 = dinode)
268 * Return a pointer to the block number of the next height of the metadata
269 * tree given a buffer containing the pointer to the current height of the
273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
275 __be64 *p = metaptr1(height, mp);
276 return p + mp->mp_list[height];
/* Issue opportunistic read-ahead for the block pointers in a metadata
 * buffer, starting at @pos.
 *
 * NOTE(review): several lines of this function are elided in this excerpt
 * (zero-pointer skip, the !uptodate/brelse paths and closing braces);
 * comments cover only the visible statements. */
static void gfs2_metapath_ra(struct gfs2_glock *gl,
			     const struct buffer_head *bh, const __be64 *pos)
	struct buffer_head *rabh;
	/* One past the last pointer slot in this buffer. */
	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
	for (t = pos; t < endp; t++) {
		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		/* Only start read-ahead if nobody else holds the buffer lock. */
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
305 * lookup_mp_height - helper function for lookup_metapath
308 * @h: the height which needs looking up
310 static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
312 __be64 *ptr = metapointer(h, mp);
313 u64 dblock = be64_to_cpu(*ptr);
318 return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
322 * lookup_metapath - Walk the metadata tree to a specific point
326 * Assumes that the inode's buffer has already been looked up and
327 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
328 * by find_metapath().
330 * If this function encounters part of the tree which has not been
331 * allocated, it returns the current height of the tree at the point
332 * at which it found the unallocated block. Blocks which are found are
333 * added to the mp->mp_bh[] list.
335 * Returns: error or height of metadata tree
338 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
340 unsigned int end_of_metadata = ip->i_height - 1;
344 for (x = 0; x < end_of_metadata; x++) {
345 ret = lookup_mp_height(ip, mp, x);
354 * fillup_metapath - fill up buffers for the metadata path to a specific height
357 * @h: The height to which it should be mapped
359 * Similar to lookup_metapath, but does lookups for a range of heights
361 * Returns: error or height of metadata tree
364 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
366 unsigned int start_h = h - 1;
370 /* find the first buffer we need to look up. */
371 while (start_h > 0 && mp->mp_bh[start_h] == NULL)
373 for (; start_h < h; start_h++) {
374 ret = lookup_mp_height(ip, mp, start_h);
382 static inline void release_metapath(struct metapath *mp)
386 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
387 if (mp->mp_bh[i] == NULL)
389 brelse(mp->mp_bh[i]);
394 * gfs2_extent_length - Returns length of an extent of blocks
395 * @start: Start of the buffer
396 * @len: Length of the buffer in bytes
397 * @ptr: Current position in the buffer
398 * @limit: Max extent length to return (0 = unlimited)
399 * @eob: Set to 1 if we hit "end of block"
401 * If the first block is zero (unallocated) it will return the number of
402 * unallocated blocks in the extent, otherwise it will return the number
403 * of contiguous blocks in the extent.
405 * Returns: The length of the extent (minimum of one block)
408 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
410 const __be64 *end = (start + len);
411 const __be64 *first = ptr;
412 u64 d = be64_to_cpu(*ptr);
419 if (limit && --limit == 0)
423 } while(be64_to_cpu(*ptr) == d);
426 return (ptr - first);
429 static inline void bmap_lock(struct gfs2_inode *ip, int create)
432 down_write(&ip->i_rw_mutex);
434 down_read(&ip->i_rw_mutex);
437 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
440 up_write(&ip->i_rw_mutex);
442 up_read(&ip->i_rw_mutex);
445 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
446 struct gfs2_glock *gl, unsigned int i,
447 unsigned offset, u64 bn)
449 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
450 ((i > 1) ? sizeof(struct gfs2_meta_header) :
451 sizeof(struct gfs2_dinode)));
453 BUG_ON(mp->mp_bh[i] != NULL);
454 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
455 gfs2_trans_add_meta(gl, mp->mp_bh[i]);
456 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
457 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
459 *ptr = cpu_to_be64(bn);
/* States for the block-allocation state machine in gfs2_bmap_alloc(). */
enum alloc_state {
	ALLOC_DATA = 0,		/* Tree complete; allocating data blocks */
	ALLOC_GROW_DEPTH = 1,	/* Filling in the lower part of the tree */
	ALLOC_GROW_HEIGHT = 2,	/* Growing the height of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
470 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
473 return sdp->sd_inptrs;
474 return sdp->sd_diptrs;
/**
 * gfs2_bmap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @lblock: The logical starting block of the extent
 * @bh_map: This is used to return the mapping details
 * @mp: The metapath
 * @sheight: The starting height (i.e. whats already mapped)
 * @height: The height to build to
 * @maxlen: The max number of data blocks to alloc
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per bmap call)
 * and uses the state machine to initialise the blocks in order.
 *
 * Returns: errno on error
 *
 * NOTE(review): many lines of this function (the final parameter line,
 * declarations, the do/switch structure, labels, error checks and
 * closing braces) are elided in this excerpt; comments apply only to
 * the visible statements.
 */
static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
			   struct buffer_head *bh_map, struct metapath *mp,
			   const unsigned int sheight,
			   const unsigned int height,
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned ptrs_per_blk;
	const unsigned end_of_metadata = height - 1;
	enum alloc_state state;
	/* The caller must have looked up the dinode buffer already. */
	BUG_ON(dibh == NULL);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	if (height == sheight) {
		struct buffer_head *bh;
		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
		/* Need to allocate indirect blocks */
		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
			    mp->mp_list[end_of_metadata]));
		if (height == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = height - sheight;
			state = ALLOC_GROW_DEPTH;
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = height - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (height - branch_start);
	/* start of the second part of the function (state machine) */
	blks = dblks + iblks;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		/* Metadata (and jdata) blocks need revokes cancelled. */
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			ptr = (__be64 *)(dibh->b_data +
					 sizeof(struct gfs2_dinode));
			for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == height - ip->i_height) {
				/* Move the dinode's old pointers down into the
				   new top indirect block. */
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
						 sizeof(struct gfs2_meta_header));
				state = ALLOC_GROW_DEPTH;
				for(i = branch_start; i < height; i++) {
					if (mp->mp_bh[i] == NULL)
					brelse(mp->mp_bh[i]);
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < height)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
		/* Tree complete, adding data blocks */
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			ptr = metapointer(end_of_metadata, mp);
				*ptr++ = cpu_to_be64(bn++);
			if (buffer_zeronew(bh_map)) {
				/* Caller asked for zeroed blocks (e.g. fallocate). */
				ret = sb_issue_zeroout(sb, dblock, dblks,
					"Failed to zero data buffers\n");
				clear_buffer_zeronew(bh_map);
	} while ((state != ALLOC_DATA) || !dblock);
	/* Commit the new tree shape back into the dinode. */
	ip->i_height = height;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	map_bh(bh_map, inode->i_sb, dblock);
	bh_map->b_size = dblks << inode->i_blkbits;
	set_buffer_new(bh_map);
/**
 * gfs2_block_map - Map a block from an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if its ok to alloc blocks to satify the request
 *
 * Sets buffer_mapped() if successful, sets buffer_boundary() if a
 * read of metadata will be required before the next block can be
 * mapped. Sets buffer_new() if new blocks were allocated.
 *
 * NOTE(review): declarations, labels and several branches/braces are
 * elided in this excerpt; comments apply only to the visible statements.
 */
int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int bsize = sdp->sd_sb.sb_bsize;
	/* How many blocks the caller would like mapped in one go. */
	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
	const u64 *arr = sdp->sd_heightsize;
	struct buffer_head *bh;
	memset(&mp, 0, sizeof(mp));
	bmap_lock(ip, create);
	/* Start from a clean mapping state. */
	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
	/* Directories use the journaled block size / height table. */
	if (gfs2_is_dir(ip)) {
		bsize = sdp->sd_jbsize;
		arr = sdp->sd_jheightsize;
	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
	/* Work out the tree height needed to address this offset. */
	height = ip->i_height;
	size = (lblock + 1) * bsize;
	while (size > arr[height])
	find_metapath(sdp, lblock, &mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
	ret = lookup_metapath(ip, &mp);
	if (ret != ip->i_height)
	ptr = metapointer(ip->i_height - 1, &mp);
	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
	bh = mp.mp_bh[ip->i_height - 1];
	/* Extend the mapping over the contiguous extent that follows. */
	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
	bh_map->b_size = (len << inode->i_blkbits);
	set_buffer_boundary(bh_map);
	release_metapath(&mp);
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	bmap_unlock(ip, create);
/* All allocations are done here, firstly check create flag */
	BUG_ON(gfs2_is_stuffed(ip));
/* At this point ret is the tree depth of already allocated blocks */
	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
/*
 * Deprecated: do not use in new code
 *
 * Maps @lblock and reports the disk block and extent length in output
 * parameters. NOTE(review): locals, the derivation of "create" and the
 * early-exit checks are elided in this excerpt.
 */
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
	/* For lookups, request a 32-block extent in a single bmap call. */
	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
	ret = gfs2_block_map(inode, lblock, &bh, create);
	*extlen = bh.b_size >> inode->i_blkbits;
	*dblock = bh.b_blocknr;
/**
 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
 * @mapping: the page cache of the inode being truncated
 * @from: the new file size; the partial block containing it is zeroed
 *
 * This is partly borrowed from ext3.
 *
 * NOTE(review): locals, error labels, page unlock/put and several braces
 * are elided in this excerpt; comments apply only to visible statements.
 */
static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
	struct inode *inode = mapping->host;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned long index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	unsigned blocksize, iblock, length, pos;
	struct buffer_head *bh;
	page = find_or_create_page(mapping, index, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;
	/* Bytes to zero: from "offset" to the end of its block. */
	length = blocksize - (offset & (blocksize - 1));
	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	while (offset >= pos) {
		bh = bh->b_this_page;
	if (!buffer_mapped(bh)) {
		gfs2_block_map(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);
	if (!buffer_uptodate(bh)) {
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
	/* Zero the tail of the block and dirty the buffer. */
	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
/* Max revokes per truncate transaction chunk for journaled-data files. */
#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
 *
 * NOTE(review): the loop's chunk clamping, error checks and transaction
 * end are elided in this excerpt.
 */
static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	/* Largest byte range whose revokes fit in one transaction. */
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	while (oldsize != newsize) {
		chunk = oldsize - newsize;
		if (chunk > max_chunk)
		/* Truncate one chunk of page cache at a time. */
		truncate_pagecache(inode, oldsize - chunk);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
/* trunc_start - first phase of a shrink: update i_size, zero the partial
 * tail block and truncate the page cache.
 *
 * NOTE(review): error labels, the if/else structure joining the two
 * gfs2_trans_begin() calls and several braces are elided in this excerpt;
 * comments apply only to the visible statements. */
static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct address_space *mapping = inode->i_mapping;
	struct buffer_head *dibh;
	int journaled = gfs2_is_jdata(ip);
	/* Journaled data needs revoke room in the transaction... */
	error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
	/* ...ordinary data only needs the dinode. */
	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	if (gfs2_is_stuffed(ip)) {
		/* Inline data: just clear everything past the new size. */
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
			/* Zero the tail of the partial final block. */
			error = gfs2_block_truncate_page(mapping, newsize);
		/* Mark the truncate in progress for crash recovery. */
		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
	i_size_write(inode, newsize);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_dinode_out(ip, dibh->b_data);
	error = gfs2_journaled_truncate(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: the inode
 * @rd_gh: holder of resource group glock
 * @mp: current metapath fully populated with buffers
 * @btotal: place to keep count of total blocks freed
 * @hgt: height we're processing
 * @first: true if this is the first call to this function for this height
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 *
 * NOTE(review): many lines of this function (labels, the retry loop, some
 * branches and closing braces) are elided in this excerpt; comments apply
 * only to the visible statements.
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      const struct metapath *mp, u32 *btotal, int hgt,
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	struct buffer_head *bh = mp->mp_bh[hgt];
	__be64 *top, *bottom, *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	/* Anything above the data height counts as metadata when freeing. */
	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
	bool buf_in_tr = false; /* buffer was added to transaction */

	/* Sanity check: the buffer must be an indirect block, or the dinode
	   itself when sweeping height 0. */
	if (gfs2_metatype_check(sdp, bh,
				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
	blks_outside_rgrp = 0;
	top = metapointer(hgt, mp); /* first ptr from metapath */
	/* If we're keeping some data at the truncation point, we've got to
	   preserve the metadata tree by adding 1 to the starting metapath. */
	bottom = (__be64 *)(bh->b_data + bh->b_size);

	for (p = top; p < bottom; p++) {
		bn = be64_to_cpu(*p);
		if (gfs2_holder_initialized(rd_gh)) {
			rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
			gfs2_assert_withdraw(sdp,
				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
			/* No rgrp held yet: look up and lock the one for @bn. */
			rgd = gfs2_blk2rgrpd(sdp, bn, false);
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		/* Defer blocks belonging to a different rgrp to a later pass. */
		if (!rgrp_contains_block(rgd, bn)) {
		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
					atomic_read(&sdp->sd_log_thresh2);
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
				revokes += hptrs(sdp, hgt);
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			down_write(&ip->i_rw_mutex);
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */
			gfs2_trans_add_meta(ip->i_gl, bh);
		/* Coalesce runs of adjacent block numbers into one free call. */
		if (bstart + blen == bn) {
			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		/* Free any final pending run of blocks. */
		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh = mp->mp_bh[0];

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
		gfs2_glock_dq_uninit(rd_gh);
/**
 * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * assumes the metapath is valid (with buffers) out to height h
 * @sdp: the superblock
 * @mp: starting metapath
 * @h: desired height to search
 *
 * Returns: true if a non-null pointer was found in the metapath buffer
 *          false if all remaining pointers are NULL in the buffer
 *
 * NOTE(review): the surrounding search loop, the true-return path and
 * declarations are elided in this excerpt; comments apply only to the
 * visible statements.
 */
static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
	/* Index of the last valid pointer slot at this height. */
	unsigned int ptrs = hptrs(sdp, h) - 1;
		ptr = metapointer(h, mp);
		if (*ptr) /* if we have a non-null pointer */
		/* Otherwise advance to the next slot while any remain. */
		if (mp->mp_list[h] < ptrs)
	return false; /* no more pointers in this buffer */
/* States of the bottom-up truncate/delete state machine in trunc_dealloc(). */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,  /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,     /* process complete */
};
/**
 * trunc_dealloc - truncate a file down to a desired size
 * @ip: inode to truncate
 * @newsize: The desired size of the file
 *
 * This function truncates a file to newsize. It works from the
 * bottom up, and from the right to the left. In other words, it strips off
 * the highest layer (data) before stripping any of the metadata. Doing it
 * this way is best in case the operation is interrupted by power failure, etc.
 * The dinode is rewritten in every transaction to guarantee integrity.
 *
 * NOTE(review): declarations, labels, the switch structure, error checks
 * and many braces are elided in this excerpt; comments apply only to the
 * visible statements.
 */
static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
	unsigned int strip_h = ip->i_height - 1;
	int mp_h; /* metapath buffers are read in to this height */
	sector_t last_ra = 0;
	bool preserve1; /* need to preserve the first meta pointer? */

	/* Logical block of the last byte we keep. */
	lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
	memset(&mp, 0, sizeof(mp));
	find_metapath(sdp, lblock, &mp, ip->i_height);
	/* Remember the truncation point in the metapath. */
	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
	ret = gfs2_meta_inode_buffer(ip, &dibh);
	ret = lookup_metapath(ip, &mp);
	if (ret == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
		state = DEALLOC_FILL_MP; /* deal with partial metapath */
	ret = gfs2_rindex_update(sdp);
	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	gfs2_holder_mark_uninitialized(&rd_gh);

	while (state != DEALLOC_DONE) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			if (mp_h > 0) { /* issue read-ahead on metadata */
				bh = mp.mp_bh[mp_h - 1];
				/* Avoid re-issuing read-ahead for the same parent. */
				if (bh->b_blocknr != last_ra) {
					last_ra = bh->b_blocknr;
					top = metaptr1(mp_h - 1, &mp);
					gfs2_metapath_ra(ip->i_gl, bh, top);
			/* If we're truncating to a non-zero size and the mp is
			   at the beginning of file for the strip height, we
			   need to preserve the first metadata pointer. */
			preserve1 = (newsize &&
				     (mp.mp_list[mp_h] == nbof[mp_h]));
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			prev_bnr = bh->b_blocknr;
			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
			/* If we hit an error or just swept dinode buffer,
				state = DEALLOC_DONE;
			state = DEALLOC_MP_LOWER;
		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
				state = DEALLOC_FILL_MP;
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
				break; /* loop around in the same state */
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
				state = DEALLOC_FILL_MP;
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */
		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			/* If buffers found for the entire strip height */
			if ((ret == ip->i_height) && (mp_h == strip_h)) {
				state = DEALLOC_MP_FULL;
			if (ret < ip->i_height) /* We have a partial height */
			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h))
				state = DEALLOC_MP_LOWER;

	if (current->journal_info == NULL) {
		ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
		down_write(&ip->i_rw_mutex);
	/* Credit the freed blocks back to statfs and the user's quota. */
	gfs2_statfs_change(sdp, 0, +btotal, 0);
	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	gfs2_quota_unhold(ip);
	release_metapath(&mp);
/* trunc_end - finish a truncate: clear TRUNC_IN_PROG, update timestamps
 * and write the dinode back.
 *
 * NOTE(review): error labels and several braces are elided in this
 * excerpt; comments apply only to the visible statements. */
static int trunc_end(struct gfs2_inode *ip)
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	down_write(&ip->i_rw_mutex);
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!i_size_read(&ip->i_inode)) {
		/* File is now empty: reset the allocation goal and clear
		   any remaining inline tail. */
		ip->i_goal = ip->i_no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
		gfs2_ordered_del_inode(ip);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	/* The truncate is complete; clear the in-progress flag. */
	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
/**
 * do_shrink - make a file smaller
 * @inode: the inode
 * @oldsize: the current inode size
 * @newsize: the size to make the file
 *
 * Called with an exclusive lock on @inode. The @size must
 * be equal to or smaller than the current inode size.
 *
 * NOTE(review): the error checks between steps are elided in this excerpt.
 */
static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
	struct gfs2_inode *ip = GFS2_I(inode);
	error = trunc_start(inode, oldsize, newsize);
	/* A stuffed file has no allocated blocks to deallocate. */
	if (gfs2_is_stuffed(ip))
	error = trunc_dealloc(ip, newsize);
	error = trunc_end(ip);
/*
 * gfs2_trim_blocks - trim blocks allocated beyond the current file size
 * @inode: the inode to trim
 *
 * Runs do_shrink() with newsize == oldsize, i.e. a "truncate to the same
 * size".  Since do_shrink()'s dealloc phase frees blocks beyond @newsize,
 * this releases any allocation past EOF without changing the file size.
 *
 * NOTE(review): the `int ret;` declaration and whatever is done with @ret
 * afterwards (presumably a WARN on failure — confirm against the full
 * file) are elided in this excerpt.
 */
1382 void gfs2_trim_blocks(struct inode *inode)
1384 u64 size = inode->i_size;
1387 ret = do_shrink(inode, size, size);
1392  * do_grow - Touch and update inode size
 * @inode: the inode being grown
1394  * @size: The new size
1396  * This function updates the timestamps on the inode and
1397  * may also increase the size of the inode. This function
1398  * must not be called with @size any smaller than the current
 *
1401  * Although it is not strictly required to unstuff files here,
1402  * earlier versions of GFS2 have a bug in the stuffed file reading
1403  * code which will result in a buffer overrun if the size is larger
1404  * than the max stuffed file size. In order to prevent this from
1405  * occurring, such files are unstuffed, but in other cases we can
1406  * just update the inode size directly.
 *
 * NOTE(review): in this excerpt the quota/reserve error checks, the
 * `unstuff` flag that gates the unstuff/release/qunlock paths, the
 * brelse() of @dibh and the final return are elided.
 *
1408  * Returns: 0 on success, or -ve on error
 */
1411 static int do_grow(struct inode *inode, u64 size)
1413 struct gfs2_inode *ip = GFS2_I(inode);
1414 struct gfs2_sbd *sdp = GFS2_SB(inode);
1415 struct gfs2_alloc_parms ap = { .target = 1, };
1416 struct buffer_head *dibh;
/*
 * A stuffed inode growing past what fits inside the dinode block must
 * be unstuffed, which needs a block allocation: take quota and
 * resource-group reservations first.
 */
1420 if (gfs2_is_stuffed(ip) &&
1421 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1422 error = gfs2_quota_lock_check(ip, &ap);
1426 error = gfs2_inplace_reserve(ip, &ap);
1428 goto do_grow_qunlock;
/* Reserve extra journal space for the quota change unless quotas are off */
1432 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1433 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1436 goto do_grow_release;
1439 error = gfs2_unstuff_dinode(ip, NULL);
1444 error = gfs2_meta_inode_buffer(ip, &dibh);
/* Commit the new size and timestamps into the on-disk dinode */
1448 i_size_write(inode, size);
1449 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1450 gfs2_trans_add_meta(ip->i_gl, dibh);
1451 gfs2_dinode_out(ip, dibh->b_data);
1455 gfs2_trans_end(sdp);
/* Unwind the reservations taken for the unstuff case */
1458 gfs2_inplace_release(ip);
1460 gfs2_quota_unlock(ip);
1466  * gfs2_setattr_size - make a file a given size
 * @inode: the inode
1468  * @newsize: the size to make the file
1470  * The file size can grow, shrink, or stay the same size. This
1471  * is called holding i_mutex and an exclusive glock on the inode
 *
 * Dispatches to do_grow() when @newsize >= the current size and
 * do_shrink() otherwise, after validating the new size, waiting for
 * in-flight direct I/O, and setting up the per-inode block/quota
 * reservation structures (torn down again at the end).
 *
 * NOTE(review): early-return error checks after inode_newsize_ok() and
 * gfs2_rsqa_alloc(), the `goto out` on the grow path, the `out:` label
 * and the final return are elided in this excerpt.
 *
 * Returns: errno
 */
1477 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1479 struct gfs2_inode *ip = GFS2_I(inode);
/* Only regular files may be resized through this path */
1483 BUG_ON(!S_ISREG(inode->i_mode));
1485 ret = inode_newsize_ok(inode, newsize);
1489 inode_dio_wait(inode);
1491 ret = gfs2_rsqa_alloc(ip);
1495 oldsize = inode->i_size;
1496 if (newsize >= oldsize) {
1497 ret = do_grow(inode, newsize);
1501 ret = do_shrink(inode, oldsize, newsize);
1503 gfs2_rsqa_delete(ip, NULL);
/*
 * gfs2_truncatei_resume - resume an interrupted truncate
 * @ip: the inode whose truncate is being resumed
 *
 * Re-runs the deallocation phase down to the current on-disk size and
 * then finishes via trunc_end().  Presumably invoked when a dinode is
 * found with TRUNC_IN_PROG still set (e.g. after a crash) — confirm
 * against the callers.
 *
 * NOTE(review): the `int error;` declaration, the `if (!error)` guard
 * before trunc_end() and the final return are elided in this excerpt.
 *
 * Returns: errno
 */
1507 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1510 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1512 error = trunc_end(ip);
/*
 * gfs2_file_dealloc - deallocate all of a file's blocks
 * @ip: the inode to deallocate
 *
 * Thin wrapper: truncating the metadata tree to size 0 frees every
 * data and indirect block belonging to the file.
 *
 * Returns: errno (whatever trunc_dealloc() returns)
 */
1516 int gfs2_file_dealloc(struct gfs2_inode *ip)
1518 return trunc_dealloc(ip, 0);
1522  * gfs2_free_journal_extents - Free cached journal bmap info
 * @jd: The journal descriptor whose extent cache is released
 *
 * Pops every gfs2_journal_extent off jd->extent_list until the list
 * is empty.
 *
 * NOTE(review): the kfree(jext) that should follow list_del() is elided
 * in this excerpt — confirm it is present in the full file, otherwise
 * this would leak each extent.
 */
1527 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1529 struct gfs2_journal_extent *jext;
1531 while(!list_empty(&jd->extent_list)) {
1532 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1533 list_del(&jext->list);
1539  * gfs2_add_jextent - Add or merge a new extent to extent cache
1540  * @jd: The journal descriptor
1541  * @lblock: The logical block at start of new extent
1542  * @dblock: The physical block at start of new extent
1543  * @blocks: Size of extent in fs blocks
 *
 * If the new range is physically contiguous with the last cached extent
 * (tail of jd->extent_list), it is merged by extending that extent;
 * otherwise a new extent record is allocated and appended.
 *
1545  * Returns: 0 on success or -ENOMEM
 *
 * NOTE(review): the `return 0;` after the merge, the NULL check after
 * kzalloc() and the final return are elided in this excerpt.
 */
1548 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1550 struct gfs2_journal_extent *jext;
1552 if (!list_empty(&jd->extent_list)) {
/* Try to merge with the most recently added extent */
1553 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1554 if ((jext->dblock + jext->blocks) == dblock) {
1555 jext->blocks += blocks;
/* Not contiguous: record a brand-new extent at the tail */
1560 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1563 jext->dblock = dblock;
1564 jext->lblock = lblock;
1565 jext->blocks = blocks;
1566 list_add_tail(&jext->list, &jd->extent_list);
1572  * gfs2_map_journal_extents - Cache journal bmap info
1573  * @sdp: The super block
1574  * @jd: The journal to map
 *
1576  * Create a reusable "extent" mapping from all logical
1577  * blocks to all physical blocks for the given journal. This will save
1578  * us time when writing journal blocks. Most journals will have only one
1579  * extent that maps all their logical blocks. That's because gfs2.mkfs
1580  * arranges the journal blocks sequentially to maximize performance.
1581  * So the extent would map the first block for the entire file length.
1582  * However, gfs2_jadd can happen while file activity is happening, so
1583  * those journals may not be sequential. Less likely is the case where
1584  * the users created their own journals by mounting the metafs and
1585  * laying it out. But it's still possible. These journals might have
 * several extents.
 *
1588  * Returns: 0 on success, or error on failure
 *
 * NOTE(review): several lines are elided in this excerpt — among them the
 * declarations of @lblock/@lblock_stop/@size/@rc, the loop header that
 * drives gfs2_block_map() over the whole journal, the `size -= bh.b_size`
 * style bookkeeping implied by the error message at the bottom, and both
 * return statements.  Confirm the loop structure against the full file.
 */
1591 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1595 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
/* On-stack bh used purely as an output parameter for gfs2_block_map() */
1596 struct buffer_head bh;
1597 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1601 lblock_stop = i_size_read(jd->jd_inode) >> shift;
1602 size = (lblock_stop - lblock) << shift;
/* The cache must be built from scratch — it should be empty here */
1604 WARN_ON(!list_empty(&jd->extent_list));
1610 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
/* A hole or a bmap error in a journal is fatal for the mapping */
1611 if (rc || !buffer_mapped(&bh))
/* bh.b_size covers a whole contiguous run, not just one block */
1613 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1617 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1620 fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
/* Error path: report how far we got, then discard the partial cache */
1625 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1627 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1629 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1630 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1631 bh.b_state, (unsigned long long)bh.b_size);
1632 gfs2_free_journal_extents(jd);
1637 * gfs2_write_alloc_required - figure out if a write will require an allocation
1638 * @ip: the file being written to
1639 * @offset: the offset to write to
1640 * @len: the number of bytes being written
1642 * Returns: 1 if an alloc is required, 0 otherwise
1645 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1648 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1649 struct buffer_head bh;
1651 u64 lblock, lblock_stop, size;
1657 if (gfs2_is_stuffed(ip)) {
1659 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1664 shift = sdp->sd_sb.sb_bsize_shift;
1665 BUG_ON(gfs2_is_dir(ip));
1666 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1667 lblock = offset >> shift;
1668 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1669 if (lblock_stop > end_of_file)
1672 size = (lblock_stop - lblock) << shift;
1676 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1677 if (!buffer_mapped(&bh))
1680 lblock += (bh.b_size >> ip->i_inode.i_blkbits);