fs/btrfs/inode.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <crypto/hash.h>
   7 #include <linux/kernel.h>
   8 #include <linux/bio.h>
   9 #include <linux/blk-cgroup.h>
  10 #include <linux/file.h>
  11 #include <linux/fs.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <linux/time.h>
  15 #include <linux/init.h>
  16 #include <linux/string.h>
  17 #include <linux/backing-dev.h>
  18 #include <linux/writeback.h>
  19 #include <linux/compat.h>
  20 #include <linux/xattr.h>
  21 #include <linux/posix_acl.h>
  22 #include <linux/falloc.h>
  23 #include <linux/slab.h>
  24 #include <linux/ratelimit.h>
  25 #include <linux/btrfs.h>
  26 #include <linux/blkdev.h>
  27 #include <linux/posix_acl_xattr.h>
  28 #include <linux/uio.h>
  29 #include <linux/magic.h>
  30 #include <linux/iversion.h>
  31 #include <linux/swap.h>
  32 #include <linux/migrate.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/iomap.h>
  35 #include <asm/unaligned.h>
  36 #include <linux/fsverity.h>
  37 #include "misc.h"
  38 #include "ctree.h"
  39 #include "disk-io.h"
  40 #include "transaction.h"
  41 #include "btrfs_inode.h"
  42 #include "print-tree.h"
  43 #include "ordered-data.h"
  44 #include "xattr.h"
  45 #include "tree-log.h"
  46 #include "volumes.h"
  47 #include "compression.h"
  48 #include "locking.h"
  49 #include "free-space-cache.h"
  50 #include "props.h"
  51 #include "qgroup.h"
  52 #include "delalloc-space.h"
  53 #include "block-group.h"
  54 #include "space-info.h"
  55 #include "zoned.h"
  56 #include "subpage.h"
  57 #include "inode-item.h"
  58
/*
 * Pairs an inode number with the root it is looked up in.
 *
 * NOTE(review): presumably the key handed to the inode cache lookup helpers;
 * the consumers are outside this section of the file - confirm there.
 */
struct btrfs_iget_args {
	u64 ino;			/* inode number */
	struct btrfs_root *root;	/* root paired with @ino */
};
  63
/*
 * Per-request bookkeeping carrying a byte count and a changeset pointer.
 *
 * NOTE(review): by its name this is direct IO state; the users are outside
 * this section of the file, so the exact lifecycle must be confirmed there.
 */
struct btrfs_dio_data {
	ssize_t submitted;	/* presumably bytes submitted so far - TODO confirm */
	struct extent_changeset *data_reserved;	/* presumably the data space reservation - TODO confirm */
};
  68
/*
 * Context for a rename; currently it only carries a result back to the caller.
 */
struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};
  73
/*
 * Operation tables declared up front so code earlier in the file can refer to
 * them.  They are static, so the initialized definitions have to live in this
 * translation unit (not visible in this section of the file).
 */
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

/*
 * kmem_cache pointers.  btrfs_inode_cachep is private to this file, the other
 * four have external linkage.  Where they get created and destroyed is not
 * visible in this section of the file.
 */
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

/* Prototypes of static helpers that are used before they are defined. */
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct btrfs_inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);
 103
 104 /*
 105  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 106  *
 107  * ilock_flags can have the following bit set:
 108  *
 109  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 110  * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 111  *                   return -EAGAIN
 112  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 113  */
 114 int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
 115 {
 116         if (ilock_flags & BTRFS_ILOCK_SHARED) {
 117                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 118                         if (!inode_trylock_shared(inode))
 119                                 return -EAGAIN;
 120                         else
 121                                 return 0;
 122                 }
 123                 inode_lock_shared(inode);
 124         } else {
 125                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 126                         if (!inode_trylock(inode))
 127                                 return -EAGAIN;
 128                         else
 129                                 return 0;
 130                 }
 131                 inode_lock(inode);
 132         }
 133         if (ilock_flags & BTRFS_ILOCK_MMAP)
 134                 down_write(&BTRFS_I(inode)->i_mmap_lock);
 135         return 0;
 136 }
 137
 138 /*
 139  * btrfs_inode_unlock - unock inode i_rwsem
 140  *
 141  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 142  * to decide whether the lock acquired is shared or exclusive.
 143  */
 144 void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
 145 {
 146         if (ilock_flags & BTRFS_ILOCK_MMAP)
 147                 up_write(&BTRFS_I(inode)->i_mmap_lock);
 148         if (ilock_flags & BTRFS_ILOCK_SHARED)
 149                 inode_unlock_shared(inode);
 150         else
 151                 inode_unlock(inode);
 152 }
 153
 154 /*
 155  * Cleanup all submitted ordered extents in specified range to handle errors
 156  * from the btrfs_run_delalloc_range() callback.
 157  *
 158  * NOTE: caller must ensure that when an error happens, it can not call
 159  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 160  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 161  * to be released, which we want to happen only when finishing the ordered
 162  * extent (btrfs_finish_ordered_io()).
 163  */
 164 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 165                                                  struct page *locked_page,
 166                                                  u64 offset, u64 bytes)
 167 {
 168         unsigned long index = offset >> PAGE_SHIFT;
 169         unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
 170         u64 page_start = page_offset(locked_page);
 171         u64 page_end = page_start + PAGE_SIZE - 1;
 172
 173         struct page *page;
 174
 175         while (index <= end_index) {
 176                 /*
 177                  * For locked page, we will call end_extent_writepage() on it
 178                  * in run_delalloc_range() for the error handling.  That
 179                  * end_extent_writepage() function will call
 180                  * btrfs_mark_ordered_io_finished() to clear page Ordered and
 181                  * run the ordered extent accounting.
 182                  *
 183                  * Here we can't just clear the Ordered bit, or
 184                  * btrfs_mark_ordered_io_finished() would skip the accounting
 185                  * for the page range, and the ordered extent will never finish.
 186                  */
 187                 if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
 188                         index++;
 189                         continue;
 190                 }
 191                 page = find_get_page(inode->vfs_inode.i_mapping, index);
 192                 index++;
 193                 if (!page)
 194                         continue;
 195
 196                 /*
 197                  * Here we just clear all Ordered bits for every page in the
 198                  * range, then __endio_write_update_ordered() will handle
 199                  * the ordered extent accounting for the range.
 200                  */
 201                 btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
 202                                                offset, bytes);
 203                 put_page(page);
 204         }
 205
 206         /* The locked page covers the full range, nothing needs to be done */
 207         if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
 208                 return;
 209         /*
 210          * In case this page belongs to the delalloc range being instantiated
 211          * then skip it, since the first page of a range is going to be
 212          * properly cleaned up by the caller of run_delalloc_range
 213          */
 214         if (page_start >= offset && page_end <= (offset + bytes - 1)) {
 215                 bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
 216                 offset = page_offset(locked_page) + PAGE_SIZE;
 217         }
 218
 219         return __endio_write_update_ordered(inode, offset, bytes, false);
 220 }
 221
/* Prototype only; being static, the definition lives elsewhere in this file. */
static int btrfs_dirty_inode(struct inode *inode);
 223
 224 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 225                                      struct inode *inode,  struct inode *dir,
 226                                      const struct qstr *qstr)
 227 {
 228         int err;
 229
 230         err = btrfs_init_acl(trans, inode, dir);
 231         if (!err)
 232                 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
 233         return err;
 234 }
 235
 236 /*
 237  * this does all the hard work for inserting an inline extent into
 238  * the btree.  The caller should have done a btrfs_drop_extents so that
 239  * no overlapping inline items exist in the btree
 240  */
 241 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 242                                 struct btrfs_path *path,
 243                                 struct btrfs_inode *inode, bool extent_inserted,
 244                                 size_t size, size_t compressed_size,
 245                                 int compress_type,
 246                                 struct page **compressed_pages,
 247                                 bool update_i_size)
 248 {
 249         struct btrfs_root *root = inode->root;
 250         struct extent_buffer *leaf;
 251         struct page *page = NULL;
 252         char *kaddr;
 253         unsigned long ptr;
 254         struct btrfs_file_extent_item *ei;
 255         int ret;
 256         size_t cur_size = size;
 257         u64 i_size;
 258
 259         ASSERT((compressed_size > 0 && compressed_pages) ||
 260                (compressed_size == 0 && !compressed_pages));
 261
 262         if (compressed_size && compressed_pages)
 263                 cur_size = compressed_size;
 264
 265         if (!extent_inserted) {
 266                 struct btrfs_key key;
 267                 size_t datasize;
 268
 269                 key.objectid = btrfs_ino(inode);
 270                 key.offset = 0;
 271                 key.type = BTRFS_EXTENT_DATA_KEY;
 272
 273                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
 274                 ret = btrfs_insert_empty_item(trans, root, path, &key,
 275                                               datasize);
 276                 if (ret)
 277                         goto fail;
 278         }
 279         leaf = path->nodes[0];
 280         ei = btrfs_item_ptr(leaf, path->slots[0],
 281                             struct btrfs_file_extent_item);
 282         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 283         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 284         btrfs_set_file_extent_encryption(leaf, ei, 0);
 285         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 286         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 287         ptr = btrfs_file_extent_inline_start(ei);
 288
 289         if (compress_type != BTRFS_COMPRESS_NONE) {
 290                 struct page *cpage;
 291                 int i = 0;
 292                 while (compressed_size > 0) {
 293                         cpage = compressed_pages[i];
 294                         cur_size = min_t(unsigned long, compressed_size,
 295                                        PAGE_SIZE);
 296
 297                         kaddr = kmap_atomic(cpage);
 298                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
 299                         kunmap_atomic(kaddr);
 300
 301                         i++;
 302                         ptr += cur_size;
 303                         compressed_size -= cur_size;
 304                 }
 305                 btrfs_set_file_extent_compression(leaf, ei,
 306                                                   compress_type);
 307         } else {
 308                 page = find_get_page(inode->vfs_inode.i_mapping, 0);
 309                 btrfs_set_file_extent_compression(leaf, ei, 0);
 310                 kaddr = kmap_atomic(page);
 311                 write_extent_buffer(leaf, kaddr, ptr, size);
 312                 kunmap_atomic(kaddr);
 313                 put_page(page);
 314         }
 315         btrfs_mark_buffer_dirty(leaf);
 316         btrfs_release_path(path);
 317
 318         /*
 319          * We align size to sectorsize for inline extents just for simplicity
 320          * sake.
 321          */
 322         ret = btrfs_inode_set_file_extent_range(inode, 0,
 323                                         ALIGN(size, root->fs_info->sectorsize));
 324         if (ret)
 325                 goto fail;
 326
 327         /*
 328          * We're an inline extent, so nobody can extend the file past i_size
 329          * without locking a page we already have locked.
 330          *
 331          * We must do any i_size and inode updates before we unlock the pages.
 332          * Otherwise we could end up racing with unlink.
 333          */
 334         i_size = i_size_read(&inode->vfs_inode);
 335         if (update_i_size && size > i_size) {
 336                 i_size_write(&inode->vfs_inode, size);
 337                 i_size = size;
 338         }
 339         inode->disk_i_size = i_size;
 340
 341 fail:
 342         return ret;
 343 }
 344
 345
 346 /*
 347  * conditionally insert an inline extent into the file.  This
 348  * does the checks required to make sure the data is small enough
 349  * to fit as an inline extent.
 350  */
 351 static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
 352                                           size_t compressed_size,
 353                                           int compress_type,
 354                                           struct page **compressed_pages,
 355                                           bool update_i_size)
 356 {
 357         struct btrfs_drop_extents_args drop_args = { 0 };
 358         struct btrfs_root *root = inode->root;
 359         struct btrfs_fs_info *fs_info = root->fs_info;
 360         struct btrfs_trans_handle *trans;
 361         u64 data_len = (compressed_size ?: size);
 362         int ret;
 363         struct btrfs_path *path;
 364
 365         /*
 366          * We can create an inline extent if it ends at or beyond the current
 367          * i_size, is no larger than a sector (decompressed), and the (possibly
 368          * compressed) data fits in a leaf and the configured maximum inline
 369          * size.
 370          */
 371         if (size < i_size_read(&inode->vfs_inode) ||
 372             size > fs_info->sectorsize ||
 373             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
 374             data_len > fs_info->max_inline)
 375                 return 1;
 376
 377         path = btrfs_alloc_path();
 378         if (!path)
 379                 return -ENOMEM;
 380
 381         trans = btrfs_join_transaction(root);
 382         if (IS_ERR(trans)) {
 383                 btrfs_free_path(path);
 384                 return PTR_ERR(trans);
 385         }
 386         trans->block_rsv = &inode->block_rsv;
 387
 388         drop_args.path = path;
 389         drop_args.start = 0;
 390         drop_args.end = fs_info->sectorsize;
 391         drop_args.drop_cache = true;
 392         drop_args.replace_extent = true;
 393         drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
 394         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 395         if (ret) {
 396                 btrfs_abort_transaction(trans, ret);
 397                 goto out;
 398         }
 399
 400         ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
 401                                    size, compressed_size, compress_type,
 402                                    compressed_pages, update_i_size);
 403         if (ret && ret != -ENOSPC) {
 404                 btrfs_abort_transaction(trans, ret);
 405                 goto out;
 406         } else if (ret == -ENOSPC) {
 407                 ret = 1;
 408                 goto out;
 409         }
 410
 411         btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
 412         ret = btrfs_update_inode(trans, root, inode);
 413         if (ret && ret != -ENOSPC) {
 414                 btrfs_abort_transaction(trans, ret);
 415                 goto out;
 416         } else if (ret == -ENOSPC) {
 417                 ret = 1;
 418                 goto out;
 419         }
 420
 421         btrfs_set_inode_full_sync(inode);
 422 out:
 423         /*
 424          * Don't forget to free the reserved space, as for inlined extent
 425          * it won't count as data extent, free them directly here.
 426          * And at reserve time, it's always aligned to page size, so
 427          * just free one page here.
 428          */
 429         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
 430         btrfs_free_path(path);
 431         btrfs_end_transaction(trans);
 432         return ret;
 433 }
 434
/*
 * One extent queued on an async_chunk.  Instances are allocated and filled in
 * by add_async_extent(), which links them onto async_chunk::extents.
 */
struct async_extent {
	u64 start;		/* start of the extent, as passed to add_async_extent() */
	u64 ram_size;		/* size of the uncompressed data */
	u64 compressed_size;	/* size of the compressed data */
	struct page **pages;	/* page array handed to add_async_extent() */
	unsigned long nr_pages;	/* page count handed in together with @pages */
	int compress_type;	/* compression type recorded for this extent */
	struct list_head list;	/* entry in async_chunk::extents */
};
 444
/*
 * One unit of work handed to compress_file_range(), which reads @inode,
 * @start and @end from it.
 *
 * NOTE(review): the users of @locked_page, @write_flags, @blkcg_css, @work and
 * @async_cow are outside this section of the file.
 */
struct async_chunk {
	struct inode *inode;	/* inode the range belongs to */
	struct page *locked_page;
	u64 start;		/* first byte of the range */
	u64 end;		/* last byte of the range (inclusive, see the
				 * "end + 1" arithmetic in compress_file_range()) */
	unsigned int write_flags;
	struct list_head extents;	/* async_extent entries, appended by add_async_extent() */
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	struct async_cow *async_cow;	/* presumably the containing async_cow - TODO confirm */
};
 456
/*
 * Container for a variable number of async_chunk entries, stored inline via
 * the flexible array member.
 */
struct async_cow {
	atomic_t num_chunks;	/* presumably the number of chunks still in flight - TODO confirm */
	struct async_chunk chunks[];
};
 461
 462 static noinline int add_async_extent(struct async_chunk *cow,
 463                                      u64 start, u64 ram_size,
 464                                      u64 compressed_size,
 465                                      struct page **pages,
 466                                      unsigned long nr_pages,
 467                                      int compress_type)
 468 {
 469         struct async_extent *async_extent;
 470
 471         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 472         BUG_ON(!async_extent); /* -ENOMEM */
 473         async_extent->start = start;
 474         async_extent->ram_size = ram_size;
 475         async_extent->compressed_size = compressed_size;
 476         async_extent->pages = pages;
 477         async_extent->nr_pages = nr_pages;
 478         async_extent->compress_type = compress_type;
 479         list_add_tail(&async_extent->list, &cow->extents);
 480         return 0;
 481 }
 482
 483 /*
 484  * Check if the inode has flags compatible with compression
 485  */
 486 static inline bool inode_can_compress(struct btrfs_inode *inode)
 487 {
 488         if (inode->flags & BTRFS_INODE_NODATACOW ||
 489             inode->flags & BTRFS_INODE_NODATASUM)
 490                 return false;
 491         return true;
 492 }
 493
 494 /*
 495  * Check if the inode needs to be submitted to compression, based on mount
 496  * options, defragmentation, properties or heuristics.
 497  */
 498 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 499                                       u64 end)
 500 {
 501         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 502
 503         if (!inode_can_compress(inode)) {
 504                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
 505                         KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
 506                         btrfs_ino(inode));
 507                 return 0;
 508         }
 509         /*
 510          * Special check for subpage.
 511          *
 512          * We lock the full page then run each delalloc range in the page, thus
 513          * for the following case, we will hit some subpage specific corner case:
 514          *
 515          * 0            32K             64K
 516          * |    |///////|       |///////|
 517          *              \- A            \- B
 518          *
 519          * In above case, both range A and range B will try to unlock the full
 520          * page [0, 64K), causing the one finished later will have page
 521          * unlocked already, triggering various page lock requirement BUG_ON()s.
 522          *
 523          * So here we add an artificial limit that subpage compression can only
 524          * if the range is fully page aligned.
 525          *
 526          * In theory we only need to ensure the first page is fully covered, but
 527          * the tailing partial page will be locked until the full compression
 528          * finishes, delaying the write of other range.
 529          *
 530          * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
 531          * first to prevent any submitted async extent to unlock the full page.
 532          * By this, we can ensure for subpage case that only the last async_cow
 533          * will unlock the full page.
 534          */
 535         if (fs_info->sectorsize < PAGE_SIZE) {
 536                 if (!IS_ALIGNED(start, PAGE_SIZE) ||
 537                     !IS_ALIGNED(end + 1, PAGE_SIZE))
 538                         return 0;
 539         }
 540
 541         /* force compress */
 542         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 543                 return 1;
 544         /* defrag ioctl */
 545         if (inode->defrag_compress)
 546                 return 1;
 547         /* bad compression ratios */
 548         if (inode->flags & BTRFS_INODE_NOCOMPRESS)
 549                 return 0;
 550         if (btrfs_test_opt(fs_info, COMPRESS) ||
 551             inode->flags & BTRFS_INODE_COMPRESS ||
 552             inode->prop_compress)
 553                 return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
 554         return 0;
 555 }
 556
 557 static inline void inode_should_defrag(struct btrfs_inode *inode,
 558                 u64 start, u64 end, u64 num_bytes, u32 small_write)
 559 {
 560         /* If this is a small write inside eof, kick off a defrag */
 561         if (num_bytes < small_write &&
 562             (start > 0 || end + 1 < inode->disk_i_size))
 563                 btrfs_add_inode_defrag(NULL, inode, small_write);
 564 }
 565
 566 /*
 567  * we create compressed extents in two phases.  The first
 568  * phase compresses a range of pages that have already been
 569  * locked (both pages and state bits are locked).
 570  *
 571  * This is done inside an ordered work queue, and the compression
 572  * is spread across many cpus.  The actual IO submission is step
 573  * two, and the ordered work queue takes care of making sure that
 574  * happens in the same order things were put onto the queue by
 575  * writepages and friends.
 576  *
 577  * If this code finds it can't get good compression, it puts an
 578  * entry onto the work queue to write the uncompressed bytes.  This
 579  * makes sure that both compressed inodes and uncompressed inodes
 580  * are written in the same order that the flusher thread sent them
 581  * down.
 582  */
 583 static noinline int compress_file_range(struct async_chunk *async_chunk)
 584 {
 585         struct inode *inode = async_chunk->inode;
 586         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 587         u64 blocksize = fs_info->sectorsize;
 588         u64 start = async_chunk->start;
 589         u64 end = async_chunk->end;
 590         u64 actual_end;
 591         u64 i_size;
 592         int ret = 0;
 593         struct page **pages = NULL;
 594         unsigned long nr_pages;
 595         unsigned long total_compressed = 0;
 596         unsigned long total_in = 0;
 597         int i;
 598         int will_compress;
 599         int compress_type = fs_info->compress_type;
 600         int compressed_extents = 0;
 601         int redirty = 0;
 602
 603         inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
 604                         SZ_16K);
 605
 606         /*
 607          * We need to save i_size before now because it could change in between
 608          * us evaluating the size and assigning it.  This is because we lock and
 609          * unlock the page in truncate and fallocate, and then modify the i_size
 610          * later on.
 611          *
 612          * The barriers are to emulate READ_ONCE, remove that once i_size_read
 613          * does that for us.
 614          */
 615         barrier();
 616         i_size = i_size_read(inode);
 617         barrier();
 618         actual_end = min_t(u64, i_size, end + 1);
 619 again:
 620         will_compress = 0;
 621         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
 622         nr_pages = min_t(unsigned long, nr_pages,
 623                         BTRFS_MAX_COMPRESSED / PAGE_SIZE);
 624
 625         /*
 626          * we don't want to send crud past the end of i_size through
 627          * compression, that's just a waste of CPU time.  So, if the
 628          * end of the file is before the start of our current
 629          * requested range of bytes, we bail out to the uncompressed
 630          * cleanup code that can deal with all of this.
 631          *
 632          * It isn't really the fastest way to fix things, but this is a
 633          * very uncommon corner.
 634          */
 635         if (actual_end <= start)
 636                 goto cleanup_and_bail_uncompressed;
 637
 638         total_compressed = actual_end - start;
 639
 640         /*
 641          * Skip compression for a small file range(<=blocksize) that
 642          * isn't an inline extent, since it doesn't save disk space at all.
 643          */
 644         if (total_compressed <= blocksize &&
 645            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 646                 goto cleanup_and_bail_uncompressed;
 647
 648         /*
 649          * For subpage case, we require full page alignment for the sector
 650          * aligned range.
 651          * Thus we must also check against @actual_end, not just @end.
 652          */
 653         if (blocksize < PAGE_SIZE) {
 654                 if (!IS_ALIGNED(start, PAGE_SIZE) ||
 655                     !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
 656                         goto cleanup_and_bail_uncompressed;
 657         }
 658
 659         total_compressed = min_t(unsigned long, total_compressed,
 660                         BTRFS_MAX_UNCOMPRESSED);
 661         total_in = 0;
 662         ret = 0;
 663
 664         /*
 665          * we do compression for mount -o compress and when the
 666          * inode has not been flagged as nocompress.  This flag can
 667          * change at any time if we discover bad compression ratios.
 668          */
 669         if (inode_need_compress(BTRFS_I(inode), start, end)) {
 670                 WARN_ON(pages);
 671                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 672                 if (!pages) {
 673                         /* just bail out to the uncompressed code */
 674                         nr_pages = 0;
 675                         goto cont;
 676                 }
 677
 678                 if (BTRFS_I(inode)->defrag_compress)
 679                         compress_type = BTRFS_I(inode)->defrag_compress;
 680                 else if (BTRFS_I(inode)->prop_compress)
 681                         compress_type = BTRFS_I(inode)->prop_compress;
 682
 683                 /*
 684                  * we need to call clear_page_dirty_for_io on each
 685                  * page in the range.  Otherwise applications with the file
 686                  * mmap'd can wander in and change the page contents while
 687                  * we are compressing them.
 688                  *
 689                  * If the compression fails for any reason, we set the pages
 690                  * dirty again later on.
 691                  *
 692                  * Note that the remaining part is redirtied, the start pointer
 693                  * has moved, the end is the original one.
 694                  */
 695                 if (!redirty) {
 696                         extent_range_clear_dirty_for_io(inode, start, end);
 697                         redirty = 1;
 698                 }
 699
 700                 /* Compression level is applied here and only here */
 701                 ret = btrfs_compress_pages(
 702                         compress_type | (fs_info->compress_level << 4),
 703                                            inode->i_mapping, start,
 704                                            pages,
 705                                            &nr_pages,
 706                                            &total_in,
 707                                            &total_compressed);
 708
 709                 if (!ret) {
 710                         unsigned long offset = offset_in_page(total_compressed);
 711                         struct page *page = pages[nr_pages - 1];
 712
 713                         /* zero the tail end of the last page, we might be
 714                          * sending it down to disk
 715                          */
 716                         if (offset)
 717                                 memzero_page(page, offset, PAGE_SIZE - offset);
 718                         will_compress = 1;
 719                 }
 720         }
 721 cont:
 722         /*
 723          * Check cow_file_range() for why we don't even try to create inline
 724          * extent for subpage case.
 725          */
 726         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 727                 /* lets try to make an inline extent */
 728                 if (ret || total_in < actual_end) {
 729                         /* we didn't compress the entire range, try
 730                          * to make an uncompressed inline extent.
 731                          */
 732                         ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
 733                                                     0, BTRFS_COMPRESS_NONE,
 734                                                     NULL, false);
 735                 } else {
 736                         /* try making a compressed inline extent */
 737                         ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
 738                                                     total_compressed,
 739                                                     compress_type, pages,
 740                                                     false);
 741                 }
 742                 if (ret <= 0) {
 743                         unsigned long clear_flags = EXTENT_DELALLOC |
 744                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
 745                                 EXTENT_DO_ACCOUNTING;
 746                         unsigned long page_error_op;
 747
 748                         page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
 749
 750                         /*
 751                          * inline extent creation worked or returned error,
 752                          * we don't need to create any more async work items.
 753                          * Unlock and free up our temp pages.
 754                          *
 755                          * We use DO_ACCOUNTING here because we need the
 756                          * delalloc_release_metadata to be done _after_ we drop
 757                          * our outstanding extent for clearing delalloc for this
 758                          * range.
 759                          */
 760                         extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
 761                                                      NULL,
 762                                                      clear_flags,
 763                                                      PAGE_UNLOCK |
 764                                                      PAGE_START_WRITEBACK |
 765                                                      page_error_op |
 766                                                      PAGE_END_WRITEBACK);
 767
 768                         /*
 769                          * Ensure we only free the compressed pages if we have
 770                          * them allocated, as we can still reach here with
 771                          * inode_need_compress() == false.
 772                          */
 773                         if (pages) {
 774                                 for (i = 0; i < nr_pages; i++) {
 775                                         WARN_ON(pages[i]->mapping);
 776                                         put_page(pages[i]);
 777                                 }
 778                                 kfree(pages);
 779                         }
 780                         return 0;
 781                 }
 782         }
 783
 784         if (will_compress) {
 785                 /*
 786                  * we aren't doing an inline extent round the compressed size
 787                  * up to a block size boundary so the allocator does sane
 788                  * things
 789                  */
 790                 total_compressed = ALIGN(total_compressed, blocksize);
 791
 792                 /*
 793                  * one last check to make sure the compression is really a
 794                  * win, compare the page count read with the blocks on disk,
 795                  * compression must free at least one sector size
 796                  */
 797                 total_in = round_up(total_in, fs_info->sectorsize);
 798                 if (total_compressed + blocksize <= total_in) {
 799                         compressed_extents++;
 800
 801                         /*
 802                          * The async work queues will take care of doing actual
 803                          * allocation on disk for these compressed pages, and
 804                          * will submit them to the elevator.
 805                          */
 806                         add_async_extent(async_chunk, start, total_in,
 807                                         total_compressed, pages, nr_pages,
 808                                         compress_type);
 809
 810                         if (start + total_in < end) {
 811                                 start += total_in;
 812                                 pages = NULL;
 813                                 cond_resched();
 814                                 goto again;
 815                         }
 816                         return compressed_extents;
 817                 }
 818         }
 819         if (pages) {
 820                 /*
 821                  * the compression code ran but failed to make things smaller,
 822                  * free any pages it allocated and our page pointer array
 823                  */
 824                 for (i = 0; i < nr_pages; i++) {
 825                         WARN_ON(pages[i]->mapping);
 826                         put_page(pages[i]);
 827                 }
 828                 kfree(pages);
 829                 pages = NULL;
 830                 total_compressed = 0;
 831                 nr_pages = 0;
 832
 833                 /* flag the file so we don't compress in the future */
 834                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
 835                     !(BTRFS_I(inode)->prop_compress)) {
 836                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 837                 }
 838         }
 839 cleanup_and_bail_uncompressed:
 840         /*
 841          * No compression, but we still need to write the pages in the file
 842          * we've been given so far.  redirty the locked page if it corresponds
 843          * to our extent and set things up for the async work queue to run
 844          * cow_file_range to do the normal delalloc dance.
 845          */
 846         if (async_chunk->locked_page &&
 847             (page_offset(async_chunk->locked_page) >= start &&
 848              page_offset(async_chunk->locked_page)) <= end) {
 849                 __set_page_dirty_nobuffers(async_chunk->locked_page);
 850                 /* unlocked later on in the async handlers */
 851         }
 852
 853         if (redirty)
 854                 extent_range_redirty_for_io(inode, start, end);
 855         add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 856                          BTRFS_COMPRESS_NONE);
 857         compressed_extents++;
 858
 859         return compressed_extents;
 860 }
 861
 862 static void free_async_extent_pages(struct async_extent *async_extent)
 863 {
 864         int i;
 865
 866         if (!async_extent->pages)
 867                 return;
 868
 869         for (i = 0; i < async_extent->nr_pages; i++) {
 870                 WARN_ON(async_extent->pages[i]->mapping);
 871                 put_page(async_extent->pages[i]);
 872         }
 873         kfree(async_extent->pages);
 874         async_extent->nr_pages = 0;
 875         async_extent->pages = NULL;
 876 }
 877
 878 static int submit_uncompressed_range(struct btrfs_inode *inode,
 879                                      struct async_extent *async_extent,
 880                                      struct page *locked_page)
 881 {
 882         u64 start = async_extent->start;
 883         u64 end = async_extent->start + async_extent->ram_size - 1;
 884         unsigned long nr_written = 0;
 885         int page_started = 0;
 886         int ret;
 887
 888         /*
 889          * Call cow_file_range() to run the delalloc range directly, since we
 890          * won't go to NOCOW or async path again.
 891          *
 892          * Also we call cow_file_range() with @unlock_page == 0, so that we
 893          * can directly submit them without interruption.
 894          */
 895         ret = cow_file_range(inode, locked_page, start, end, &page_started,
 896                              &nr_written, 0);
 897         /* Inline extent inserted, page gets unlocked and everything is done */
 898         if (page_started) {
 899                 ret = 0;
 900                 goto out;
 901         }
 902         if (ret < 0) {
 903                 if (locked_page)
 904                         unlock_page(locked_page);
 905                 goto out;
 906         }
 907
 908         ret = extent_write_locked_range(&inode->vfs_inode, start, end);
 909         /* All pages will be unlocked, including @locked_page */
 910 out:
 911         kfree(async_extent);
 912         return ret;
 913 }
 914
 915 static int submit_one_async_extent(struct btrfs_inode *inode,
 916                                    struct async_chunk *async_chunk,
 917                                    struct async_extent *async_extent,
 918                                    u64 *alloc_hint)
 919 {
 920         struct extent_io_tree *io_tree = &inode->io_tree;
 921         struct btrfs_root *root = inode->root;
 922         struct btrfs_fs_info *fs_info = root->fs_info;
 923         struct btrfs_key ins;
 924         struct page *locked_page = NULL;
 925         struct extent_map *em;
 926         int ret = 0;
 927         u64 start = async_extent->start;
 928         u64 end = async_extent->start + async_extent->ram_size - 1;
 929
 930         /*
 931          * If async_chunk->locked_page is in the async_extent range, we need to
 932          * handle it.
 933          */
 934         if (async_chunk->locked_page) {
 935                 u64 locked_page_start = page_offset(async_chunk->locked_page);
 936                 u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
 937
 938                 if (!(start >= locked_page_end || end <= locked_page_start))
 939                         locked_page = async_chunk->locked_page;
 940         }
 941         lock_extent(io_tree, start, end);
 942
 943         /* We have fall back to uncompressed write */
 944         if (!async_extent->pages)
 945                 return submit_uncompressed_range(inode, async_extent, locked_page);
 946
 947         ret = btrfs_reserve_extent(root, async_extent->ram_size,
 948                                    async_extent->compressed_size,
 949                                    async_extent->compressed_size,
 950                                    0, *alloc_hint, &ins, 1, 1);
 951         if (ret) {
 952                 free_async_extent_pages(async_extent);
 953                 /*
 954                  * Here we used to try again by going back to non-compressed
 955                  * path for ENOSPC.  But we can't reserve space even for
 956                  * compressed size, how could it work for uncompressed size
 957                  * which requires larger size?  So here we directly go error
 958                  * path.
 959                  */
 960                 goto out_free;
 961         }
 962
 963         /* Here we're doing allocation and writeback of the compressed pages */
 964         em = create_io_em(inode, start,
 965                           async_extent->ram_size,       /* len */
 966                           start,                        /* orig_start */
 967                           ins.objectid,                 /* block_start */
 968                           ins.offset,                   /* block_len */
 969                           ins.offset,                   /* orig_block_len */
 970                           async_extent->ram_size,       /* ram_bytes */
 971                           async_extent->compress_type,
 972                           BTRFS_ORDERED_COMPRESSED);
 973         if (IS_ERR(em)) {
 974                 ret = PTR_ERR(em);
 975                 goto out_free_reserve;
 976         }
 977         free_extent_map(em);
 978
 979         ret = btrfs_add_ordered_extent(inode, start,            /* file_offset */
 980                                        async_extent->ram_size,  /* num_bytes */
 981                                        async_extent->ram_size,  /* ram_bytes */
 982                                        ins.objectid,            /* disk_bytenr */
 983                                        ins.offset,              /* disk_num_bytes */
 984                                        0,                       /* offset */
 985                                        1 << BTRFS_ORDERED_COMPRESSED,
 986                                        async_extent->compress_type);
 987         if (ret) {
 988                 btrfs_drop_extent_cache(inode, start, end, 0);
 989                 goto out_free_reserve;
 990         }
 991         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 992
 993         /* Clear dirty, set writeback and unlock the pages. */
 994         extent_clear_unlock_delalloc(inode, start, end,
 995                         NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 996                         PAGE_UNLOCK | PAGE_START_WRITEBACK);
 997         if (btrfs_submit_compressed_write(inode, start, /* file_offset */
 998                             async_extent->ram_size,     /* num_bytes */
 999                             ins.objectid,               /* disk_bytenr */
1000                             ins.offset,                 /* compressed_len */
1001                             async_extent->pages,        /* compressed_pages */
1002                             async_extent->nr_pages,
1003                             async_chunk->write_flags,
1004                             async_chunk->blkcg_css, true)) {
1005                 const u64 start = async_extent->start;
1006                 const u64 end = start + async_extent->ram_size - 1;
1007
1008                 btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
1009
1010                 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
1011                                              PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1012                 free_async_extent_pages(async_extent);
1013         }
1014         *alloc_hint = ins.objectid + ins.offset;
1015         kfree(async_extent);
1016         return ret;
1017
1018 out_free_reserve:
1019         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1020         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1021 out_free:
1022         extent_clear_unlock_delalloc(inode, start, end,
1023                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1024                                      EXTENT_DELALLOC_NEW |
1025                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1026                                      PAGE_UNLOCK | PAGE_START_WRITEBACK |
1027                                      PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1028         free_async_extent_pages(async_extent);
1029         kfree(async_extent);
1030         return ret;
1031 }
1032
1033 /*
1034  * Phase two of compressed writeback.  This is the ordered portion of the code,
1035  * which only gets called in the order the work was queued.  We walk all the
1036  * async extents created by compress_file_range and send them down to the disk.
1037  */
1038 static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
1039 {
1040         struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
1041         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1042         struct async_extent *async_extent;
1043         u64 alloc_hint = 0;
1044         int ret = 0;
1045
1046         while (!list_empty(&async_chunk->extents)) {
1047                 u64 extent_start;
1048                 u64 ram_size;
1049
1050                 async_extent = list_entry(async_chunk->extents.next,
1051                                           struct async_extent, list);
1052                 list_del(&async_extent->list);
1053                 extent_start = async_extent->start;
1054                 ram_size = async_extent->ram_size;
1055
1056                 ret = submit_one_async_extent(inode, async_chunk, async_extent,
1057                                               &alloc_hint);
1058                 btrfs_debug(fs_info,
1059 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1060                             inode->root->root_key.objectid,
1061                             btrfs_ino(inode), extent_start, ram_size, ret);
1062         }
1063 }
1064
1065 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1066                                       u64 num_bytes)
1067 {
1068         struct extent_map_tree *em_tree = &inode->extent_tree;
1069         struct extent_map *em;
1070         u64 alloc_hint = 0;
1071
1072         read_lock(&em_tree->lock);
1073         em = search_extent_mapping(em_tree, start, num_bytes);
1074         if (em) {
1075                 /*
1076                  * if block start isn't an actual block number then find the
1077                  * first block in this inode and use that as a hint.  If that
1078                  * block is also bogus then just don't worry about it.
1079                  */
1080                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1081                         free_extent_map(em);
1082                         em = search_extent_mapping(em_tree, 0, 0);
1083                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1084                                 alloc_hint = em->block_start;
1085                         if (em)
1086                                 free_extent_map(em);
1087                 } else {
1088                         alloc_hint = em->block_start;
1089                         free_extent_map(em);
1090                 }
1091         }
1092         read_unlock(&em_tree->lock);
1093
1094         return alloc_hint;
1095 }
1096
1097 /*
1098  * when extent_io.c finds a delayed allocation range in the file,
1099  * the call backs end up in this code.  The basic idea is to
1100  * allocate extents on disk for the range, and create ordered data structs
1101  * in ram to track those extents.
1102  *
1103  * locked_page is the page that writepage had locked already.  We use
1104  * it to make sure we don't do extra locks or unlocks.
1105  *
1106  * *page_started is set to one if we unlock locked_page and do everything
1107  * required to start IO on it.  It may be clean and already done with
1108  * IO when we return.
1109  */
1110 static noinline int cow_file_range(struct btrfs_inode *inode,
1111                                    struct page *locked_page,
1112                                    u64 start, u64 end, int *page_started,
1113                                    unsigned long *nr_written, int unlock)
1114 {
1115         struct btrfs_root *root = inode->root;
1116         struct btrfs_fs_info *fs_info = root->fs_info;
1117         u64 alloc_hint = 0;
1118         u64 num_bytes;
1119         unsigned long ram_size;
1120         u64 cur_alloc_size = 0;
1121         u64 min_alloc_size;
1122         u64 blocksize = fs_info->sectorsize;
1123         struct btrfs_key ins;
1124         struct extent_map *em;
1125         unsigned clear_bits;
1126         unsigned long page_ops;
1127         bool extent_reserved = false;
1128         int ret = 0;
1129
1130         if (btrfs_is_free_space_inode(inode)) {
1131                 ret = -EINVAL;
1132                 goto out_unlock;
1133         }
1134
1135         num_bytes = ALIGN(end - start + 1, blocksize);
1136         num_bytes = max(blocksize,  num_bytes);
1137         ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1138
1139         inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1140
1141         /*
1142          * Due to the page size limit, for subpage we can only trigger the
1143          * writeback for the dirty sectors of page, that means data writeback
1144          * is doing more writeback than what we want.
1145          *
1146          * This is especially unexpected for some call sites like fallocate,
1147          * where we only increase i_size after everything is done.
1148          * This means we can trigger inline extent even if we didn't want to.
1149          * So here we skip inline extent creation completely.
1150          */
1151         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
1152                 u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1153                                        end + 1);
1154
1155                 /* lets try to make an inline extent */
1156                 ret = cow_file_range_inline(inode, actual_end, 0,
1157                                             BTRFS_COMPRESS_NONE, NULL, false);
1158                 if (ret == 0) {
1159                         /*
1160                          * We use DO_ACCOUNTING here because we need the
1161                          * delalloc_release_metadata to be run _after_ we drop
1162                          * our outstanding extent for clearing delalloc for this
1163                          * range.
1164                          */
1165                         extent_clear_unlock_delalloc(inode, start, end,
1166                                      locked_page,
1167                                      EXTENT_LOCKED | EXTENT_DELALLOC |
1168                                      EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1169                                      EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1170                                      PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1171                         *nr_written = *nr_written +
1172                              (end - start + PAGE_SIZE) / PAGE_SIZE;
1173                         *page_started = 1;
1174                         /*
1175                          * locked_page is locked by the caller of
1176                          * writepage_delalloc(), not locked by
1177                          * __process_pages_contig().
1178                          *
1179                          * We can't let __process_pages_contig() to unlock it,
1180                          * as it doesn't have any subpage::writers recorded.
1181                          *
1182                          * Here we manually unlock the page, since the caller
1183                          * can't use page_started to determine if it's an
1184                          * inline extent or a compressed extent.
1185                          */
1186                         unlock_page(locked_page);
1187                         goto out;
1188                 } else if (ret < 0) {
1189                         goto out_unlock;
1190                 }
1191         }
1192
1193         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1194         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
1195
1196         /*
1197          * Relocation relies on the relocated extents to have exactly the same
1198          * size as the original extents. Normally writeback for relocation data
1199          * extents follows a NOCOW path because relocation preallocates the
1200          * extents. However, due to an operation such as scrub turning a block
1201          * group to RO mode, it may fallback to COW mode, so we must make sure
1202          * an extent allocated during COW has exactly the requested size and can
1203          * not be split into smaller extents, otherwise relocation breaks and
1204          * fails during the stage where it updates the bytenr of file extent
1205          * items.
1206          */
1207         if (btrfs_is_data_reloc_root(root))
1208                 min_alloc_size = num_bytes;
1209         else
1210                 min_alloc_size = fs_info->sectorsize;
1211
1212         while (num_bytes > 0) {
1213                 cur_alloc_size = num_bytes;
1214                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1215                                            min_alloc_size, 0, alloc_hint,
1216                                            &ins, 1, 1);
1217                 if (ret < 0)
1218                         goto out_unlock;
1219                 cur_alloc_size = ins.offset;
1220                 extent_reserved = true;
1221
1222                 ram_size = ins.offset;
1223                 em = create_io_em(inode, start, ins.offset, /* len */
1224                                   start, /* orig_start */
1225                                   ins.objectid, /* block_start */
1226                                   ins.offset, /* block_len */
1227                                   ins.offset, /* orig_block_len */
1228                                   ram_size, /* ram_bytes */
1229                                   BTRFS_COMPRESS_NONE, /* compress_type */
1230                                   BTRFS_ORDERED_REGULAR /* type */);
1231                 if (IS_ERR(em)) {
1232                         ret = PTR_ERR(em);
1233                         goto out_reserve;
1234                 }
1235                 free_extent_map(em);
1236
1237                 ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
1238                                                ins.objectid, cur_alloc_size, 0,
1239                                                1 << BTRFS_ORDERED_REGULAR,
1240                                                BTRFS_COMPRESS_NONE);
1241                 if (ret)
1242                         goto out_drop_extent_cache;
1243
1244                 if (btrfs_is_data_reloc_root(root)) {
1245                         ret = btrfs_reloc_clone_csums(inode, start,
1246                                                       cur_alloc_size);
1247                         /*
1248                          * Only drop cache here, and process as normal.
1249                          *
1250                          * We must not allow extent_clear_unlock_delalloc()
1251                          * at out_unlock label to free meta of this ordered
1252                          * extent, as its meta should be freed by
1253                          * btrfs_finish_ordered_io().
1254                          *
1255                          * So we must continue until @start is increased to
1256                          * skip current ordered extent.
1257                          */
1258                         if (ret)
1259                                 btrfs_drop_extent_cache(inode, start,
1260                                                 start + ram_size - 1, 0);
1261                 }
1262
1263                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1264
1265                 /*
1266                  * We're not doing compressed IO, don't unlock the first page
1267                  * (which the caller expects to stay locked), don't clear any
1268                  * dirty bits and don't set any writeback bits
1269                  *
1270                  * Do set the Ordered (Private2) bit so we know this page was
1271                  * properly setup for writepage.
1272                  */
1273                 page_ops = unlock ? PAGE_UNLOCK : 0;
1274                 page_ops |= PAGE_SET_ORDERED;
1275
1276                 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1277                                              locked_page,
1278                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1279                                              page_ops);
1280                 if (num_bytes < cur_alloc_size)
1281                         num_bytes = 0;
1282                 else
1283                         num_bytes -= cur_alloc_size;
1284                 alloc_hint = ins.objectid + ins.offset;
1285                 start += cur_alloc_size;
1286                 extent_reserved = false;
1287
1288                 /*
1289                  * btrfs_reloc_clone_csums() error, since start is increased
1290                  * extent_clear_unlock_delalloc() at out_unlock label won't
1291                  * free metadata of current ordered extent, we're OK to exit.
1292                  */
1293                 if (ret)
1294                         goto out_unlock;
1295         }
1296 out:
1297         return ret;
1298
1299 out_drop_extent_cache:
1300         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1301 out_reserve:
1302         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1303         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1304 out_unlock:
1305         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1306                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1307         page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1308         /*
1309          * If we reserved an extent for our delalloc range (or a subrange) and
1310          * failed to create the respective ordered extent, then it means that
1311          * when we reserved the extent we decremented the extent's size from
1312          * the data space_info's bytes_may_use counter and incremented the
1313          * space_info's bytes_reserved counter by the same amount. We must make
1314          * sure extent_clear_unlock_delalloc() does not try to decrement again
1315          * the data space_info's bytes_may_use counter, therefore we do not pass
1316          * it the flag EXTENT_CLEAR_DATA_RESV.
1317          */
1318         if (extent_reserved) {
1319                 extent_clear_unlock_delalloc(inode, start,
1320                                              start + cur_alloc_size - 1,
1321                                              locked_page,
1322                                              clear_bits,
1323                                              page_ops);
1324                 start += cur_alloc_size;
1325                 if (start >= end)
1326                         goto out;
1327         }
1328         extent_clear_unlock_delalloc(inode, start, end, locked_page,
1329                                      clear_bits | EXTENT_CLEAR_DATA_RESV,
1330                                      page_ops);
1331         goto out;
1332 }
1333
1334 /*
1335  * work queue call back to started compression on a file and pages
1336  */
1337 static noinline void async_cow_start(struct btrfs_work *work)
1338 {
1339         struct async_chunk *async_chunk;
1340         int compressed_extents;
1341
1342         async_chunk = container_of(work, struct async_chunk, work);
1343
1344         compressed_extents = compress_file_range(async_chunk);
1345         if (compressed_extents == 0) {
1346                 btrfs_add_delayed_iput(async_chunk->inode);
1347                 async_chunk->inode = NULL;
1348         }
1349 }
1350
1351 /*
1352  * work queue call back to submit previously compressed pages
1353  */
1354 static noinline void async_cow_submit(struct btrfs_work *work)
1355 {
1356         struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1357                                                      work);
1358         struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1359         unsigned long nr_pages;
1360
1361         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1362                 PAGE_SHIFT;
1363
1364         /*
1365          * ->inode could be NULL if async_chunk_start has failed to compress,
1366          * in which case we don't have anything to submit, yet we need to
1367          * always adjust ->async_delalloc_pages as its paired with the init
1368          * happening in cow_file_range_async
1369          */
1370         if (async_chunk->inode)
1371                 submit_compressed_extents(async_chunk);
1372
1373         /* atomic_sub_return implies a barrier */
1374         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1375             5 * SZ_1M)
1376                 cond_wake_up_nomb(&fs_info->async_submit_wait);
1377 }
1378
1379 static noinline void async_cow_free(struct btrfs_work *work)
1380 {
1381         struct async_chunk *async_chunk;
1382         struct async_cow *async_cow;
1383
1384         async_chunk = container_of(work, struct async_chunk, work);
1385         if (async_chunk->inode)
1386                 btrfs_add_delayed_iput(async_chunk->inode);
1387         if (async_chunk->blkcg_css)
1388                 css_put(async_chunk->blkcg_css);
1389
1390         async_cow = async_chunk->async_cow;
1391         if (atomic_dec_and_test(&async_cow->num_chunks))
1392                 kvfree(async_cow);
1393 }
1394
1395 static int cow_file_range_async(struct btrfs_inode *inode,
1396                                 struct writeback_control *wbc,
1397                                 struct page *locked_page,
1398                                 u64 start, u64 end, int *page_started,
1399                                 unsigned long *nr_written)
1400 {
1401         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1402         struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1403         struct async_cow *ctx;
1404         struct async_chunk *async_chunk;
1405         unsigned long nr_pages;
1406         u64 cur_end;
1407         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1408         int i;
1409         bool should_compress;
1410         unsigned nofs_flag;
1411         const unsigned int write_flags = wbc_to_write_flags(wbc);
1412
1413         unlock_extent(&inode->io_tree, start, end);
1414
1415         if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1416             !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1417                 num_chunks = 1;
1418                 should_compress = false;
1419         } else {
1420                 should_compress = true;
1421         }
1422
1423         nofs_flag = memalloc_nofs_save();
1424         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1425         memalloc_nofs_restore(nofs_flag);
1426
1427         if (!ctx) {
1428                 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1429                         EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1430                         EXTENT_DO_ACCOUNTING;
1431                 unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
1432                                          PAGE_END_WRITEBACK | PAGE_SET_ERROR;
1433
1434                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1435                                              clear_bits, page_ops);
1436                 return -ENOMEM;
1437         }
1438
1439         async_chunk = ctx->chunks;
1440         atomic_set(&ctx->num_chunks, num_chunks);
1441
1442         for (i = 0; i < num_chunks; i++) {
1443                 if (should_compress)
1444                         cur_end = min(end, start + SZ_512K - 1);
1445                 else
1446                         cur_end = end;
1447
1448                 /*
1449                  * igrab is called higher up in the call chain, take only the
1450                  * lightweight reference for the callback lifetime
1451                  */
1452                 ihold(&inode->vfs_inode);
1453                 async_chunk[i].async_cow = ctx;
1454                 async_chunk[i].inode = &inode->vfs_inode;
1455                 async_chunk[i].start = start;
1456                 async_chunk[i].end = cur_end;
1457                 async_chunk[i].write_flags = write_flags;
1458                 INIT_LIST_HEAD(&async_chunk[i].extents);
1459
1460                 /*
1461                  * The locked_page comes all the way from writepage and its
1462                  * the original page we were actually given.  As we spread
1463                  * this large delalloc region across multiple async_chunk
1464                  * structs, only the first struct needs a pointer to locked_page
1465                  *
1466                  * This way we don't need racey decisions about who is supposed
1467                  * to unlock it.
1468                  */
1469                 if (locked_page) {
1470                         /*
1471                          * Depending on the compressibility, the pages might or
1472                          * might not go through async.  We want all of them to
1473                          * be accounted against wbc once.  Let's do it here
1474                          * before the paths diverge.  wbc accounting is used
1475                          * only for foreign writeback detection and doesn't
1476                          * need full accuracy.  Just account the whole thing
1477                          * against the first page.
1478                          */
1479                         wbc_account_cgroup_owner(wbc, locked_page,
1480                                                  cur_end - start);
1481                         async_chunk[i].locked_page = locked_page;
1482                         locked_page = NULL;
1483                 } else {
1484                         async_chunk[i].locked_page = NULL;
1485                 }
1486
1487                 if (blkcg_css != blkcg_root_css) {
1488                         css_get(blkcg_css);
1489                         async_chunk[i].blkcg_css = blkcg_css;
1490                 } else {
1491                         async_chunk[i].blkcg_css = NULL;
1492                 }
1493
1494                 btrfs_init_work(&async_chunk[i].work, async_cow_start,
1495                                 async_cow_submit, async_cow_free);
1496
1497                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1498                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1499
1500                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1501
1502                 *nr_written += nr_pages;
1503                 start = cur_end + 1;
1504         }
1505         *page_started = 1;
1506         return 0;
1507 }
1508
1509 static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
1510                                        struct page *locked_page, u64 start,
1511                                        u64 end, int *page_started,
1512                                        unsigned long *nr_written)
1513 {
1514         int ret;
1515
1516         ret = cow_file_range(inode, locked_page, start, end, page_started,
1517                              nr_written, 0);
1518         if (ret)
1519                 return ret;
1520
1521         if (*page_started)
1522                 return 0;
1523
1524         __set_page_dirty_nobuffers(locked_page);
1525         account_page_redirty(locked_page);
1526         extent_write_locked_range(&inode->vfs_inode, start, end);
1527         *page_started = 1;
1528
1529         return 0;
1530 }
1531
1532 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1533                                         u64 bytenr, u64 num_bytes)
1534 {
1535         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1536         struct btrfs_ordered_sum *sums;
1537         int ret;
1538         LIST_HEAD(list);
1539
1540         ret = btrfs_lookup_csums_range(csum_root, bytenr,
1541                                        bytenr + num_bytes - 1, &list, 0);
1542         if (ret == 0 && list_empty(&list))
1543                 return 0;
1544
1545         while (!list_empty(&list)) {
1546                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1547                 list_del(&sums->list);
1548                 kfree(sums);
1549         }
1550         if (ret < 0)
1551                 return ret;
1552         return 1;
1553 }
1554
1555 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1556                            const u64 start, const u64 end,
1557                            int *page_started, unsigned long *nr_written)
1558 {
1559         const bool is_space_ino = btrfs_is_free_space_inode(inode);
1560         const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1561         const u64 range_bytes = end + 1 - start;
1562         struct extent_io_tree *io_tree = &inode->io_tree;
1563         u64 range_start = start;
1564         u64 count;
1565
1566         /*
1567          * If EXTENT_NORESERVE is set it means that when the buffered write was
1568          * made we had not enough available data space and therefore we did not
1569          * reserve data space for it, since we though we could do NOCOW for the
1570          * respective file range (either there is prealloc extent or the inode
1571          * has the NOCOW bit set).
1572          *
1573          * However when we need to fallback to COW mode (because for example the
1574          * block group for the corresponding extent was turned to RO mode by a
1575          * scrub or relocation) we need to do the following:
1576          *
1577          * 1) We increment the bytes_may_use counter of the data space info.
1578          *    If COW succeeds, it allocates a new data extent and after doing
1579          *    that it decrements the space info's bytes_may_use counter and
1580          *    increments its bytes_reserved counter by the same amount (we do
1581          *    this at btrfs_add_reserved_bytes()). So we need to increment the
1582          *    bytes_may_use counter to compensate (when space is reserved at
1583          *    buffered write time, the bytes_may_use counter is incremented);
1584          *
1585          * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1586          *    that if the COW path fails for any reason, it decrements (through
1587          *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1588          *    data space info, which we incremented in the step above.
1589          *
1590          * If we need to fallback to cow and the inode corresponds to a free
1591          * space cache inode or an inode of the data relocation tree, we must
1592          * also increment bytes_may_use of the data space_info for the same
1593          * reason. Space caches and relocated data extents always get a prealloc
1594          * extent for them, however scrub or balance may have set the block
1595          * group that contains that extent to RO mode and therefore force COW
1596          * when starting writeback.
1597          */
1598         count = count_range_bits(io_tree, &range_start, end, range_bytes,
1599                                  EXTENT_NORESERVE, 0);
1600         if (count > 0 || is_space_ino || is_reloc_ino) {
1601                 u64 bytes = count;
1602                 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1603                 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1604
1605                 if (is_space_ino || is_reloc_ino)
1606                         bytes = range_bytes;
1607
1608                 spin_lock(&sinfo->lock);
1609                 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1610                 spin_unlock(&sinfo->lock);
1611
1612                 if (count > 0)
1613                         clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1614                                          0, 0, NULL);
1615         }
1616
1617         return cow_file_range(inode, locked_page, start, end, page_started,
1618                               nr_written, 1);
1619 }
1620
/*
 * Writeback callback for NOCOW delalloc ranges.  This checks for snapshots or
 * COW copies of the extents that exist in the file, and COWs the file as
 * required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 *
 * The file extent items covering [start, end] are walked one by one.  Parts
 * of the range that cannot be written in place are accumulated, starting at
 * cow_start, and handed to fallback_to_cow(); parts that can be written in
 * place get an ordered extent that points at the existing disk bytes.
 *
 * Errors from the helpers are returned to the caller.  When ret is non-zero
 * at the end and cur_offset < end, the range from cur_offset to end is
 * cleaned up with extent_clear_unlock_delalloc().
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
                                       struct page *locked_page,
                                       const u64 start, const u64 end,
                                       int *page_started,
                                       unsigned long *nr_written)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_root *root = inode->root;
        struct btrfs_path *path;
        /* Start of the pending range that must be COWed, (u64)-1 if none */
        u64 cow_start = (u64)-1;
        /* First byte of [start, end] that has not been dealt with yet */
        u64 cur_offset = start;
        int ret;
        bool check_prev = true;
        const bool freespace_inode = btrfs_is_free_space_inode(inode);
        u64 ino = btrfs_ino(inode);
        /* True while we hold a count taken by btrfs_inc_nocow_writers() */
        bool nocow = false;
        u64 disk_bytenr = 0;
        const bool force = inode->flags & BTRFS_INODE_NODATACOW;

        path = btrfs_alloc_path();
        if (!path) {
                /* Nothing was processed yet, clean up the whole range */
                extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_DO_ACCOUNTING |
                                             EXTENT_DEFRAG, PAGE_UNLOCK |
                                             PAGE_START_WRITEBACK |
                                             PAGE_END_WRITEBACK);
                return -ENOMEM;
        }

        while (1) {
                struct btrfs_key found_key;
                struct btrfs_file_extent_item *fi;
                struct extent_buffer *leaf;
                u64 extent_end;
                u64 extent_offset;
                u64 num_bytes = 0;
                u64 disk_num_bytes;
                u64 ram_bytes;
                int extent_type;

                nocow = false;

                ret = btrfs_lookup_file_extent(NULL, root, path, ino,
                                               cur_offset, 0);
                if (ret < 0)
                        goto error;

                /*
                 * If there is no extent for our range when doing the initial
                 * search, then go back to the previous slot as it will be the
                 * one containing the search offset
                 */
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                check_prev = false;
next_slot:
                /* Go to next leaf if we have exhausted the current one */
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0) {
                                /*
                                 * Rewind so the error path also cleans up
                                 * the pending COW range.
                                 */
                                if (cow_start != (u64)-1)
                                        cur_offset = cow_start;
                                goto error;
                        }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Didn't find anything for our INO */
                if (found_key.objectid > ino)
                        break;
                /*
                 * Keep searching until we find an EXTENT_ITEM or there are no
                 * more extents for this inode
                 */
                if (WARN_ON_ONCE(found_key.objectid < ino) ||
                    found_key.type < BTRFS_EXTENT_DATA_KEY) {
                        path->slots[0]++;
                        goto next_slot;
                }

                /* Found key is not EXTENT_DATA_KEY or starts after req range */
                if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;

                /*
                 * If the found extent starts after requested offset, then
                 * adjust extent_end to be right before this extent begins
                 */
                if (found_key.offset > cur_offset) {
                        extent_end = found_key.offset;
                        /*
                         * nocow is false here, so out_check adds the gap
                         * [cur_offset, extent_end) to the COW range.
                         */
                        extent_type = 0;
                        goto out_check;
                }

                /*
                 * Found extent which begins before our range and potentially
                 * intersect it
                 */
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);

                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
                if (extent_type == BTRFS_FILE_EXTENT_REG ||
                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = found_key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
                        disk_num_bytes =
                                btrfs_file_extent_disk_num_bytes(leaf, fi);
                        /*
                         * If the extent we got ends before our current offset,
                         * skip to the next extent.
                         */
                        if (extent_end <= cur_offset) {
                                path->slots[0]++;
                                goto next_slot;
                        }
                        /* Skip holes */
                        if (disk_bytenr == 0)
                                goto out_check;
                        /* Skip compressed/encrypted/encoded extents */
                        if (btrfs_file_extent_compression(leaf, fi) ||
                            btrfs_file_extent_encryption(leaf, fi) ||
                            btrfs_file_extent_other_encoding(leaf, fi))
                                goto out_check;
                        /*
                         * If extent is created before the last volume's snapshot
                         * this implies the extent is shared, hence we can't do
                         * nocow. This is the same check as in
                         * btrfs_cross_ref_exist but without calling
                         * btrfs_search_slot.
                         */
                        if (!freespace_inode &&
                            btrfs_file_extent_generation(leaf, fi) <=
                            btrfs_root_last_snapshot(&root->root_item))
                                goto out_check;
                        /*
                         * Without NODATACOW on the inode only prealloc
                         * extents may be written in place.
                         */
                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
                                goto out_check;

                        /*
                         * The following checks can be expensive, as they need to
                         * take other locks and do btree or rbtree searches, so
                         * release the path to avoid blocking other tasks for too
                         * long.
                         */
                        btrfs_release_path(path);

                        ret = btrfs_cross_ref_exist(root, ino,
                                                    found_key.offset -
                                                    extent_offset, disk_bytenr, false);
                        if (ret) {
                                /*
                                 * ret could be -EIO if the above fails to read
                                 * metadata.
                                 */
                                if (ret < 0) {
                                        if (cow_start != (u64)-1)
                                                cur_offset = cow_start;
                                        goto error;
                                }

                                WARN_ON_ONCE(freespace_inode);
                                goto out_check;
                        }
                        /*
                         * Turn disk_bytenr into the disk address matching
                         * cur_offset and clamp the length to our range.
                         */
                        disk_bytenr += extent_offset;
                        disk_bytenr += cur_offset - found_key.offset;
                        num_bytes = min(end + 1, extent_end) - cur_offset;
                        /*
                         * If there are pending snapshots for this root, we
                         * fall into common COW way
                         */
                        if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
                                goto out_check;
                        /*
                         * force cow if csum exists in the range.
                         * this ensure that csum for a given extent are
                         * either valid or do not exist.
                         */
                        ret = csum_exist_in_range(fs_info, disk_bytenr,
                                                  num_bytes);
                        if (ret) {
                                /*
                                 * ret could be -EIO if the above fails to read
                                 * metadata.
                                 */
                                if (ret < 0) {
                                        if (cow_start != (u64)-1)
                                                cur_offset = cow_start;
                                        goto error;
                                }
                                WARN_ON_ONCE(freespace_inode);
                                goto out_check;
                        }
                        /* If the extent's block group is RO, we must COW */
                        if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
                                goto out_check;
                        nocow = true;
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = found_key.offset + ram_bytes;
                        extent_end = ALIGN(extent_end, fs_info->sectorsize);
                        /* Skip extents outside of our requested range */
                        if (extent_end <= start) {
                                path->slots[0]++;
                                goto next_slot;
                        }
                } else {
                        /* If this triggers then we have a memory corruption */
                        BUG();
                }
out_check:
                /*
                 * If nocow is false then record the beginning of the range
                 * that needs to be COWed
                 */
                if (!nocow) {
                        if (cow_start == (u64)-1)
                                cow_start = cur_offset;
                        cur_offset = extent_end;
                        if (cur_offset > end)
                                break;
                        /*
                         * NOTE(review): nodes[0] is presumably NULL only
                         * after the btrfs_release_path() above; in that case
                         * redo the lookup from the new cur_offset.
                         */
                        if (!path->nodes[0])
                                continue;
                        path->slots[0]++;
                        goto next_slot;
                }

                /*
                 * COW range from cow_start to found_key.offset - 1. As the key
                 * will contain the beginning of the first extent that can be
                 * NOCOW, following one which needs to be COW'ed
                 */
                if (cow_start != (u64)-1) {
                        ret = fallback_to_cow(inode, locked_page,
                                              cow_start, found_key.offset - 1,
                                              page_started, nr_written);
                        if (ret)
                                goto error;
                        cow_start = (u64)-1;
                }

                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        u64 orig_start = found_key.offset - extent_offset;
                        struct extent_map *em;

                        em = create_io_em(inode, cur_offset, num_bytes,
                                          orig_start,
                                          disk_bytenr, /* block_start */
                                          num_bytes, /* block_len */
                                          disk_num_bytes, /* orig_block_len */
                                          ram_bytes, BTRFS_COMPRESS_NONE,
                                          BTRFS_ORDERED_PREALLOC);
                        if (IS_ERR(em)) {
                                ret = PTR_ERR(em);
                                goto error;
                        }
                        free_extent_map(em);
                        ret = btrfs_add_ordered_extent(inode,
                                        cur_offset, num_bytes, num_bytes,
                                        disk_bytenr, num_bytes, 0,
                                        1 << BTRFS_ORDERED_PREALLOC,
                                        BTRFS_COMPRESS_NONE);
                        if (ret) {
                                /* Undo the extent map created just above */
                                btrfs_drop_extent_cache(inode, cur_offset,
                                                        cur_offset + num_bytes - 1,
                                                        0);
                                goto error;
                        }
                } else {
                        ret = btrfs_add_ordered_extent(inode, cur_offset,
                                                       num_bytes, num_bytes,
                                                       disk_bytenr, num_bytes,
                                                       0,
                                                       1 << BTRFS_ORDERED_NOCOW,
                                                       BTRFS_COMPRESS_NONE);
                        if (ret)
                                goto error;
                }

                /*
                 * The ordered extent exists now, drop the count taken by
                 * btrfs_inc_nocow_writers() above.
                 */
                if (nocow)
                        btrfs_dec_nocow_writers(fs_info, disk_bytenr);
                nocow = false;

                if (btrfs_is_data_reloc_root(root))
                        /*
                         * Error handled later, as we must prevent
                         * extent_clear_unlock_delalloc() in error handler
                         * from freeing metadata of created ordered extent.
                         */
                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
                                                      num_bytes);

                extent_clear_unlock_delalloc(inode, cur_offset,
                                             cur_offset + num_bytes - 1,
                                             locked_page, EXTENT_LOCKED |
                                             EXTENT_DELALLOC |
                                             EXTENT_CLEAR_DATA_RESV,
                                             PAGE_UNLOCK | PAGE_SET_ORDERED);

                cur_offset = extent_end;

                /*
                 * btrfs_reloc_clone_csums() error, now we're OK to call error
                 * handler, as metadata for created ordered extent will only
                 * be freed by btrfs_finish_ordered_io().
                 */
                if (ret)
                        goto error;
                if (cur_offset > end)
                        break;
        }
        btrfs_release_path(path);

        /* Whatever is left of the range has no NOCOW extent: COW it */
        if (cur_offset <= end && cow_start == (u64)-1)
                cow_start = cur_offset;

        if (cow_start != (u64)-1) {
                /*
                 * With cur_offset == end the cleanup under the error label
                 * (which requires cur_offset < end) is skipped if
                 * fallback_to_cow() fails.
                 * NOTE(review): presumably because the COW path cleans up
                 * its own range on failure; confirm against
                 * cow_file_range().
                 */
                cur_offset = end;
                ret = fallback_to_cow(inode, locked_page, cow_start, end,
                                      page_started, nr_written);
                if (ret)
                        goto error;
        }

error:
        /* Still holding a nocow writer count if we bailed out early */
        if (nocow)
                btrfs_dec_nocow_writers(fs_info, disk_bytenr);

        if (ret && cur_offset < end)
                extent_clear_unlock_delalloc(inode, cur_offset, end,
                                             locked_page, EXTENT_LOCKED |
                                             EXTENT_DELALLOC | EXTENT_DEFRAG |
                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                             PAGE_START_WRITEBACK |
                                             PAGE_END_WRITEBACK);
        btrfs_free_path(path);
        return ret;
}
1980
1981 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
1982 {
1983         if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
1984                 if (inode->defrag_bytes &&
1985                     test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
1986                                    0, NULL))
1987                         return false;
1988                 return true;
1989         }
1990         return false;
1991 }
1992
1993 /*
1994  * Function to process delayed allocation (create CoW) for ranges which are
1995  * being touched for the first time.
1996  */
1997 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
1998                 u64 start, u64 end, int *page_started, unsigned long *nr_written,
1999                 struct writeback_control *wbc)
2000 {
2001         int ret;
2002         const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2003
2004         /*
2005          * The range must cover part of the @locked_page, or the returned
2006          * @page_started can confuse the caller.
2007          */
2008         ASSERT(!(end <= page_offset(locked_page) ||
2009                  start >= page_offset(locked_page) + PAGE_SIZE));
2010
2011         if (should_nocow(inode, start, end)) {
2012                 /*
2013                  * Normally on a zoned device we're only doing COW writes, but
2014                  * in case of relocation on a zoned filesystem we have taken
2015                  * precaution, that we're only writing sequentially. It's safe
2016                  * to use run_delalloc_nocow() here, like for  regular
2017                  * preallocated inodes.
2018                  */
2019                 ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
2020                 ret = run_delalloc_nocow(inode, locked_page, start, end,
2021                                          page_started, nr_written);
2022         } else if (!inode_can_compress(inode) ||
2023                    !inode_need_compress(inode, start, end)) {
2024                 if (zoned)
2025                         ret = run_delalloc_zoned(inode, locked_page, start, end,
2026                                                  page_started, nr_written);
2027                 else
2028                         ret = cow_file_range(inode, locked_page, start, end,
2029                                              page_started, nr_written, 1);
2030         } else {
2031                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
2032                 ret = cow_file_range_async(inode, wbc, locked_page, start, end,
2033                                            page_started, nr_written);
2034         }
2035         ASSERT(ret <= 0);
2036         if (ret)
2037                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
2038                                               end - start + 1);
2039         return ret;
2040 }
2041
2042 void btrfs_split_delalloc_extent(struct inode *inode,
2043                                  struct extent_state *orig, u64 split)
2044 {
2045         u64 size;
2046
2047         /* not delalloc, ignore it */
2048         if (!(orig->state & EXTENT_DELALLOC))
2049                 return;
2050
2051         size = orig->end - orig->start + 1;
2052         if (size > BTRFS_MAX_EXTENT_SIZE) {
2053                 u32 num_extents;
2054                 u64 new_size;
2055
2056                 /*
2057                  * See the explanation in btrfs_merge_delalloc_extent, the same
2058                  * applies here, just in reverse.
2059                  */
2060                 new_size = orig->end - split + 1;
2061                 num_extents = count_max_extents(new_size);
2062                 new_size = split - orig->start;
2063                 num_extents += count_max_extents(new_size);
2064                 if (count_max_extents(size) >= num_extents)
2065                         return;
2066         }
2067
2068         spin_lock(&BTRFS_I(inode)->lock);
2069         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
2070         spin_unlock(&BTRFS_I(inode)->lock);
2071 }
2072
2073 /*
2074  * Handle merged delayed allocation extents so we can keep track of new extents
2075  * that are just merged onto old extents, such as when we are doing sequential
2076  * writes, so we can properly account for the metadata space we'll need.
2077  */
2078 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
2079                                  struct extent_state *other)
2080 {
2081         u64 new_size, old_size;
2082         u32 num_extents;
2083
2084         /* not delalloc, ignore it */
2085         if (!(other->state & EXTENT_DELALLOC))
2086                 return;
2087
2088         if (new->start > other->start)
2089                 new_size = new->end - other->start + 1;
2090         else
2091                 new_size = other->end - new->start + 1;
2092
2093         /* we're not bigger than the max, unreserve the space and go */
2094         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
2095                 spin_lock(&BTRFS_I(inode)->lock);
2096                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2097                 spin_unlock(&BTRFS_I(inode)->lock);
2098                 return;
2099         }
2100
2101         /*
2102          * We have to add up either side to figure out how many extents were
2103          * accounted for before we merged into one big extent.  If the number of
2104          * extents we accounted for is <= the amount we need for the new range
2105          * then we can return, otherwise drop.  Think of it like this
2106          *
2107          * [ 4k][MAX_SIZE]
2108          *
2109          * So we've grown the extent by a MAX_SIZE extent, this would mean we
2110          * need 2 outstanding extents, on one side we have 1 and the other side
2111          * we have 1 so they are == and we can return.  But in this case
2112          *
2113          * [MAX_SIZE+4k][MAX_SIZE+4k]
2114          *
2115          * Each range on their own accounts for 2 extents, but merged together
2116          * they are only 3 extents worth of accounting, so we need to drop in
2117          * this case.
2118          */
2119         old_size = other->end - other->start + 1;
2120         num_extents = count_max_extents(old_size);
2121         old_size = new->end - new->start + 1;
2122         num_extents += count_max_extents(old_size);
2123         if (count_max_extents(new_size) >= num_extents)
2124                 return;
2125
2126         spin_lock(&BTRFS_I(inode)->lock);
2127         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2128         spin_unlock(&BTRFS_I(inode)->lock);
2129 }
2130
2131 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2132                                       struct inode *inode)
2133 {
2134         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2135
2136         spin_lock(&root->delalloc_lock);
2137         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
2138                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
2139                               &root->delalloc_inodes);
2140                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2141                         &BTRFS_I(inode)->runtime_flags);
2142                 root->nr_delalloc_inodes++;
2143                 if (root->nr_delalloc_inodes == 1) {
2144                         spin_lock(&fs_info->delalloc_root_lock);
2145                         BUG_ON(!list_empty(&root->delalloc_root));
2146                         list_add_tail(&root->delalloc_root,
2147                                       &fs_info->delalloc_roots);
2148                         spin_unlock(&fs_info->delalloc_root_lock);
2149                 }
2150         }
2151         spin_unlock(&root->delalloc_lock);
2152 }
2153
2154
2155 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2156                                 struct btrfs_inode *inode)
2157 {
2158         struct btrfs_fs_info *fs_info = root->fs_info;
2159
2160         if (!list_empty(&inode->delalloc_inodes)) {
2161                 list_del_init(&inode->delalloc_inodes);
2162                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2163                           &inode->runtime_flags);
2164                 root->nr_delalloc_inodes--;
2165                 if (!root->nr_delalloc_inodes) {
2166                         ASSERT(list_empty(&root->delalloc_inodes));
2167                         spin_lock(&fs_info->delalloc_root_lock);
2168                         BUG_ON(list_empty(&root->delalloc_root));
2169                         list_del_init(&root->delalloc_root);
2170                         spin_unlock(&fs_info->delalloc_root_lock);
2171                 }
2172         }
2173 }
2174
2175 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2176                                      struct btrfs_inode *inode)
2177 {
2178         spin_lock(&root->delalloc_lock);
2179         __btrfs_del_delalloc_inode(root, inode);
2180         spin_unlock(&root->delalloc_lock);
2181 }
2182
2183 /*
2184  * Properly track delayed allocation bytes in the inode and to maintain the
2185  * list of inodes that have pending delalloc work to be done.
2186  */
2187 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
2188                                unsigned *bits)
2189 {
2190         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2191
2192         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
2193                 WARN_ON(1);
2194         /*
2195          * set_bit and clear bit hooks normally require _irqsave/restore
2196          * but in this case, we are only testing for the DELALLOC
2197          * bit, which is only set or cleared with irqs on
2198          */
2199         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2200                 struct btrfs_root *root = BTRFS_I(inode)->root;
2201                 u64 len = state->end + 1 - state->start;
2202                 u32 num_extents = count_max_extents(len);
2203                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
2204
2205                 spin_lock(&BTRFS_I(inode)->lock);
2206                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
2207                 spin_unlock(&BTRFS_I(inode)->lock);
2208
2209                 /* For sanity tests */
2210                 if (btrfs_is_testing(fs_info))
2211                         return;
2212
2213                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2214                                          fs_info->delalloc_batch);
2215                 spin_lock(&BTRFS_I(inode)->lock);
2216                 BTRFS_I(inode)->delalloc_bytes += len;
2217                 if (*bits & EXTENT_DEFRAG)
2218                         BTRFS_I(inode)->defrag_bytes += len;
2219                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2220                                          &BTRFS_I(inode)->runtime_flags))
2221                         btrfs_add_delalloc_inodes(root, inode);
2222                 spin_unlock(&BTRFS_I(inode)->lock);
2223         }
2224
2225         if (!(state->state & EXTENT_DELALLOC_NEW) &&
2226             (*bits & EXTENT_DELALLOC_NEW)) {
2227                 spin_lock(&BTRFS_I(inode)->lock);
2228                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
2229                         state->start;
2230                 spin_unlock(&BTRFS_I(inode)->lock);
2231         }
2232 }
2233
2234 /*
2235  * Once a range is no longer delalloc this function ensures that proper
2236  * accounting happens.
2237  */
2238 void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2239                                  struct extent_state *state, unsigned *bits)
2240 {
2241         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2242         struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
2243         u64 len = state->end + 1 - state->start;
2244         u32 num_extents = count_max_extents(len);
2245
2246         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
2247                 spin_lock(&inode->lock);
2248                 inode->defrag_bytes -= len;
2249                 spin_unlock(&inode->lock);
2250         }
2251
2252         /*
2253          * set_bit and clear bit hooks normally require _irqsave/restore
2254          * but in this case, we are only testing for the DELALLOC
2255          * bit, which is only set or cleared with irqs on
2256          */
2257         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2258                 struct btrfs_root *root = inode->root;
2259                 bool do_list = !btrfs_is_free_space_inode(inode);
2260
2261                 spin_lock(&inode->lock);
2262                 btrfs_mod_outstanding_extents(inode, -num_extents);
2263                 spin_unlock(&inode->lock);
2264
2265                 /*
2266                  * We don't reserve metadata space for space cache inodes so we
2267                  * don't need to call delalloc_release_metadata if there is an
2268                  * error.
2269                  */
2270                 if (*bits & EXTENT_CLEAR_META_RESV &&
2271                     root != fs_info->tree_root)
2272                         btrfs_delalloc_release_metadata(inode, len, false);
2273
2274                 /* For sanity tests. */
2275                 if (btrfs_is_testing(fs_info))
2276                         return;
2277
2278                 if (!btrfs_is_data_reloc_root(root) &&
2279                     do_list && !(state->state & EXTENT_NORESERVE) &&
2280                     (*bits & EXTENT_CLEAR_DATA_RESV))
2281                         btrfs_free_reserved_data_space_noquota(fs_info, len);
2282
2283                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2284                                          fs_info->delalloc_batch);
2285                 spin_lock(&inode->lock);
2286                 inode->delalloc_bytes -= len;
2287                 if (do_list && inode->delalloc_bytes == 0 &&
2288                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2289                                         &inode->runtime_flags))
2290                         btrfs_del_delalloc_inode(root, inode);
2291                 spin_unlock(&inode->lock);
2292         }
2293
2294         if ((state->state & EXTENT_DELALLOC_NEW) &&
2295             (*bits & EXTENT_DELALLOC_NEW)) {
2296                 spin_lock(&inode->lock);
2297                 ASSERT(inode->new_delalloc_bytes >= len);
2298                 inode->new_delalloc_bytes -= len;
2299                 if (*bits & EXTENT_ADD_INODE_BYTES)
2300                         inode_add_bytes(&inode->vfs_inode, len);
2301                 spin_unlock(&inode->lock);
2302         }
2303 }
2304
2305 /*
2306  * in order to insert checksums into the metadata in large chunks,
2307  * we wait until bio submission time.   All the pages in the bio are
2308  * checksummed and sums are attached onto the ordered extent record.
2309  *
2310  * At IO completion time the cums attached on the ordered extent record
2311  * are inserted into the btree
2312  */
2313 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
2314                                            u64 dio_file_offset)
2315 {
2316         return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
2317 }
2318
/*
 * Split an extent_map at [start, start + len]
 *
 * This function is intended to be used only for extract_ordered_extent().
 *
 * The extent map found for the range is replaced by up to three maps: one
 * covering the first @pre bytes (if @pre != 0), one covering the last @post
 * bytes (if @post != 0), and one covering what lies in between.
 *
 * Return: 0 on success or when there is nothing to split (@pre and @post both
 * 0), -ENOMEM if an extent map could not be allocated, -EIO if no extent map
 * is found for the range.
 */
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
                          u64 pre, u64 post)
{
        struct extent_map_tree *em_tree = &inode->extent_tree;
        struct extent_map *em;
        struct extent_map *split_pre = NULL;
        struct extent_map *split_mid = NULL;
        struct extent_map *split_post = NULL;
        int ret = 0;
        unsigned long flags;

        /* Sanity check */
        if (pre == 0 && post == 0)
                return 0;

        /*
         * split_pre is always needed: it holds either the @pre part or, when
         * @pre is 0, everything in front of the @post part.  split_mid is only
         * needed when both a leading and a following piece exist.
         */
        split_pre = alloc_extent_map();
        if (pre)
                split_mid = alloc_extent_map();
        if (post)
                split_post = alloc_extent_map();
        if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
                ret = -ENOMEM;
                goto out;
        }

        ASSERT(pre + post < len);

        lock_extent(&inode->io_tree, start, start + len - 1);
        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (!em) {
                ret = -EIO;
                goto out_unlock;
        }

        ASSERT(em->len == len);
        ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
        ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
        ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
        ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
        ASSERT(!list_empty(&em->list));

        /* Save the flags (still including PINNED) for the new extent maps. */
        flags = em->flags;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);

        /* First, replace the em with a new extent_map starting from em->start */
        split_pre->start = em->start;
        split_pre->len = (pre ? pre : em->len - post);
        split_pre->orig_start = split_pre->start;
        split_pre->block_start = em->block_start;
        split_pre->block_len = split_pre->len;
        split_pre->orig_block_len = split_pre->block_len;
        split_pre->ram_bytes = split_pre->len;
        split_pre->flags = flags;
        split_pre->compress_type = em->compress_type;
        split_pre->generation = em->generation;

        replace_extent_mapping(em_tree, em, split_pre, 1);

        /*
         * Now we only have an extent_map at:
         *     [em->start, em->start + pre] if pre != 0
         *     [em->start, em->start + em->len - post] if pre == 0
         */

        if (pre) {
                /* Insert the middle extent_map */
                split_mid->start = em->start + pre;
                split_mid->len = em->len - pre - post;
                split_mid->orig_start = split_mid->start;
                split_mid->block_start = em->block_start + pre;
                split_mid->block_len = split_mid->len;
                split_mid->orig_block_len = split_mid->block_len;
                split_mid->ram_bytes = split_mid->len;
                split_mid->flags = flags;
                split_mid->compress_type = em->compress_type;
                split_mid->generation = em->generation;
                add_extent_mapping(em_tree, split_mid, 1);
        }

        if (post) {
                /* Insert the extent_map covering the last @post bytes */
                split_post->start = em->start + em->len - post;
                split_post->len = post;
                split_post->orig_start = split_post->start;
                split_post->block_start = em->block_start + em->len - post;
                split_post->block_len = split_post->len;
                split_post->orig_block_len = split_post->block_len;
                split_post->ram_bytes = split_post->len;
                split_post->flags = flags;
                split_post->compress_type = em->compress_type;
                split_post->generation = em->generation;
                add_extent_mapping(em_tree, split_post, 1);
        }

        /* Once for us */
        free_extent_map(em);
        /* Once for the tree */
        free_extent_map(em);

out_unlock:
        write_unlock(&em_tree->lock);
        unlock_extent(&inode->io_tree, start, start + len - 1);
out:
        /* Drop our references on the split maps on every path. */
        free_extent_map(split_pre);
        free_extent_map(split_mid);
        free_extent_map(split_post);

        return ret;
}
2433
2434 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
2435                                            struct bio *bio, loff_t file_offset)
2436 {
2437         struct btrfs_ordered_extent *ordered;
2438         u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
2439         u64 file_len;
2440         u64 len = bio->bi_iter.bi_size;
2441         u64 end = start + len;
2442         u64 ordered_end;
2443         u64 pre, post;
2444         int ret = 0;
2445
2446         ordered = btrfs_lookup_ordered_extent(inode, file_offset);
2447         if (WARN_ON_ONCE(!ordered))
2448                 return BLK_STS_IOERR;
2449
2450         /* No need to split */
2451         if (ordered->disk_num_bytes == len)
2452                 goto out;
2453
2454         /* We cannot split once end_bio'd ordered extent */
2455         if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
2456                 ret = -EINVAL;
2457                 goto out;
2458         }
2459
2460         /* We cannot split a compressed ordered extent */
2461         if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
2462                 ret = -EINVAL;
2463                 goto out;
2464         }
2465
2466         ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
2467         /* bio must be in one ordered extent */
2468         if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
2469                 ret = -EINVAL;
2470                 goto out;
2471         }
2472
2473         /* Checksum list should be empty */
2474         if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
2475                 ret = -EINVAL;
2476                 goto out;
2477         }
2478
2479         file_len = ordered->num_bytes;
2480         pre = start - ordered->disk_bytenr;
2481         post = ordered_end - end;
2482
2483         ret = btrfs_split_ordered_extent(ordered, pre, post);
2484         if (ret)
2485                 goto out;
2486         ret = split_zoned_em(inode, file_offset, file_len, pre, post);
2487
2488 out:
2489         btrfs_put_ordered_extent(ordered);
2490
2491         return errno_to_blk_status(ret);
2492 }
2493
/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read.
 *
 * Rules about async/sync submit,
 * a) read:                             sync submit
 *
 * b) write without checksum:           sync submit
 *
 * c) write with checksum:
 *    c-1) if bio is issued by fsync:   sync submit
 *         (sync_writers != 0)
 *
 *    c-2) if root is reloc root:       sync submit
 *         (only in case of buffered IO)
 *
 *    c-3) otherwise:                   async submit
 *
 * On failure the bio is completed here with bi_status set to the error, except
 * on the compressed read path, where the bio is left to
 * btrfs_submit_compressed_read().
 *
 * Return: 0 on success, otherwise the blk_status_t error.
 */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
                                   int mirror_num, unsigned long bio_flags)

{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
        blk_status_t ret = 0;
        int skip_sum;
        /* Async submission is only considered when there are no sync writers. */
        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

        /* No checksums for NODATASUM inodes or when csums are disabled fs-wide. */
        skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
                test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);

        if (btrfs_is_free_space_inode(BTRFS_I(inode)))
                metadata = BTRFS_WQ_ENDIO_FREE_SPACE;

        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                /*
                 * Zone append write: split the ordered extent at the file
                 * offset of the bio's first page so it matches this bio.
                 */
                struct page *page = bio_first_bvec_all(bio)->bv_page;
                loff_t file_offset = page_offset(page);

                ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
                if (ret)
                        goto out;
        }

        if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                /* Not a write: set up the end_io workqueue hand-off first. */
                ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
                if (ret)
                        goto out;

                if (bio_flags & EXTENT_BIO_COMPRESSED) {
                        /*
                         * btrfs_submit_compressed_read will handle completing
                         * the bio if there were any errors, so just return
                         * here.
                         */
                        ret = btrfs_submit_compressed_read(inode, bio,
                                                           mirror_num,
                                                           bio_flags);
                        goto out_no_endio;
                } else {
                        /*
                         * Lookup bio sums does extra checks around whether we
                         * need to csum or not, which is why we ignore skip_sum
                         * here.
                         */
                        ret = btrfs_lookup_bio_sums(inode, bio, NULL);
                        if (ret)
                                goto out;
                }
                goto mapit;
        } else if (async && !skip_sum) {
                /* csum items have already been cloned */
                if (btrfs_is_data_reloc_root(root))
                        goto mapit;
                /* we're doing a write, do the async checksumming */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
                                          0, btrfs_submit_bio_start);
                goto out;
        } else if (!skip_sum) {
                /* Sync write with checksums: compute them right here. */
                ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
                if (ret)
                        goto out;
        }

mapit:
        ret = btrfs_map_bio(fs_info, bio, mirror_num);

out:
        /* Any failure reaching this label ends the bio with the error. */
        if (ret) {
                bio->bi_status = ret;
                bio_endio(bio);
        }
out_no_endio:
        return ret;
}
2589
2590 /*
2591  * given a list of ordered sums record them in the inode.  This happens
2592  * at IO completion time based on sums calculated at bio submission time.
2593  */
2594 static int add_pending_csums(struct btrfs_trans_handle *trans,
2595                              struct list_head *list)
2596 {
2597         struct btrfs_ordered_sum *sum;
2598         struct btrfs_root *csum_root = NULL;
2599         int ret;
2600
2601         list_for_each_entry(sum, list, list) {
2602                 trans->adding_csums = true;
2603                 if (!csum_root)
2604                         csum_root = btrfs_csum_root(trans->fs_info,
2605                                                     sum->bytenr);
2606                 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2607                 trans->adding_csums = false;
2608                 if (ret)
2609                         return ret;
2610         }
2611         return 0;
2612 }
2613
2614 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2615                                          const u64 start,
2616                                          const u64 len,
2617                                          struct extent_state **cached_state)
2618 {
2619         u64 search_start = start;
2620         const u64 end = start + len - 1;
2621
2622         while (search_start < end) {
2623                 const u64 search_len = end - search_start + 1;
2624                 struct extent_map *em;
2625                 u64 em_len;
2626                 int ret = 0;
2627
2628                 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2629                 if (IS_ERR(em))
2630                         return PTR_ERR(em);
2631
2632                 if (em->block_start != EXTENT_MAP_HOLE)
2633                         goto next;
2634
2635                 em_len = em->len;
2636                 if (em->start < search_start)
2637                         em_len -= search_start - em->start;
2638                 if (em_len > search_len)
2639                         em_len = search_len;
2640
2641                 ret = set_extent_bit(&inode->io_tree, search_start,
2642                                      search_start + em_len - 1,
2643                                      EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
2644                                      GFP_NOFS, NULL);
2645 next:
2646                 search_start = extent_map_end(em);
2647                 free_extent_map(em);
2648                 if (ret)
2649                         return ret;
2650         }
2651         return 0;
2652 }
2653
2654 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2655                               unsigned int extra_bits,
2656                               struct extent_state **cached_state)
2657 {
2658         WARN_ON(PAGE_ALIGNED(end));
2659
2660         if (start >= i_size_read(&inode->vfs_inode) &&
2661             !(inode->flags & BTRFS_INODE_PREALLOC)) {
2662                 /*
2663                  * There can't be any extents following eof in this case so just
2664                  * set the delalloc new bit for the range directly.
2665                  */
2666                 extra_bits |= EXTENT_DELALLOC_NEW;
2667         } else {
2668                 int ret;
2669
2670                 ret = btrfs_find_new_delalloc_bytes(inode, start,
2671                                                     end + 1 - start,
2672                                                     cached_state);
2673                 if (ret)
2674                         return ret;
2675         }
2676
2677         return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2678                                    cached_state);
2679 }
2680
/*
 * see btrfs_writepage_start_hook for details on why this is required
 *
 * One instance describes a single page to be fixed up by
 * btrfs_writepage_fixup_worker(), which also frees the structure.
 */
struct btrfs_writepage_fixup {
        /* Page to fix up; the worker locks it and drops a page reference. */
        struct page *page;
        /* Inode the page is handled for; released via a delayed iput. */
        struct inode *inode;
        /* Work item; the worker gets back to the fixup via container_of(). */
        struct btrfs_work work;
};
2687
/*
 * Work function for a queued struct btrfs_writepage_fixup.
 *
 * Reserves delalloc space for the page, waits for any ordered extent covering
 * it and then marks the page range as delalloc.  If the page was already dealt
 * with (no mapping, not dirty or not checked) the reservation is dropped and
 * nothing else is done.  On failure the error is recorded on the mapping and
 * the page.  In all cases the page is unlocked, the page reference and the
 * fixup structure are released, and the inode gets a delayed iput.
 */
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
        struct btrfs_writepage_fixup *fixup;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
        struct page *page;
        struct btrfs_inode *inode;
        u64 page_start;
        u64 page_end;
        int ret = 0;
        /* Cleared only once the delalloc bits were set successfully. */
        bool free_delalloc_space = true;

        fixup = container_of(work, struct btrfs_writepage_fixup, work);
        page = fixup->page;
        inode = BTRFS_I(fixup->inode);
        page_start = page_offset(page);
        page_end = page_offset(page) + PAGE_SIZE - 1;

        /*
         * This is similar to page_mkwrite, we need to reserve the space before
         * we take the page lock.
         */
        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
                                           PAGE_SIZE);
again:
        lock_page(page);

        /*
         * Before we queued this fixup, we took a reference on the page.
         * page->mapping may go NULL, but it shouldn't be moved to a different
         * address space.
         */
        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
                /*
                 * Unfortunately this is a little tricky, either
                 *
                 * 1) We got here and our page had already been dealt with and
                 *    we reserved our space, thus ret == 0, so we need to just
                 *    drop our space reservation and bail.  This can happen the
                 *    first time we come into the fixup worker, or could happen
                 *    while waiting for the ordered extent.
                 * 2) Our page was already dealt with, but we happened to get an
                 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
                 *    this case we obviously don't have anything to release, but
                 *    because the page was already dealt with we don't want to
                 *    mark the page with an error, so make sure we're resetting
                 *    ret to 0.  This is why we have this check _before_ the ret
                 *    check, because we do not want to have a surprise ENOSPC
                 *    when the page was already properly dealt with.
                 */
                if (!ret) {
                        btrfs_delalloc_release_extents(inode, PAGE_SIZE);
                        btrfs_delalloc_release_space(inode, data_reserved,
                                                     page_start, PAGE_SIZE,
                                                     true);
                }
                ret = 0;
                goto out_page;
        }

        /*
         * We can't mess with the page state unless it is locked, so now that
         * it is locked bail if we failed to make our space reservation.
         */
        if (ret)
                goto out_page;

        lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);

        /* already ordered? We're done */
        if (PageOrdered(page))
                goto out_reserved;

        ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
        if (ordered) {
                /*
                 * An ordered extent covers the page: drop the extent and page
                 * locks, wait for it, then recheck the page from the start.
                 */
                unlock_extent_cached(&inode->io_tree, page_start, page_end,
                                     &cached_state);
                unlock_page(page);
                btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }

        ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
                                        &cached_state);
        if (ret)
                goto out_reserved;

        /*
         * Everything went as planned, we're now the owner of a dirty page with
         * delayed allocation bits set and space reserved for our COW
         * destination.
         *
         * The page was dirty when we started, nothing should have cleaned it.
         */
        BUG_ON(!PageDirty(page));
        free_delalloc_space = false;
out_reserved:
        btrfs_delalloc_release_extents(inode, PAGE_SIZE);
        if (free_delalloc_space)
                btrfs_delalloc_release_space(inode, data_reserved, page_start,
                                             PAGE_SIZE, true);
        unlock_extent_cached(&inode->io_tree, page_start, page_end,
                             &cached_state);
out_page:
        if (ret) {
                /*
                 * We hit ENOSPC or other errors.  Update the mapping and page
                 * to reflect the errors and clean the page.
                 */
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
                clear_page_dirty_for_io(page);
                SetPageError(page);
        }
        btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
        unlock_page(page);
        /* Drop the page reference taken when the fixup was queued. */
        put_page(page);
        kfree(fixup);
        extent_changeset_free(data_reserved);
        /*
         * As a precaution, do a delayed iput in case it would be the last iput
         * that could need flushing space. Recursing back to fixup worker would
         * deadlock.
         */
        btrfs_add_delayed_iput(&inode->vfs_inode);
}
2816
2817 /*
2818  * There are a few paths in the higher layers of the kernel that directly
2819  * set the page dirty bit without asking the filesystem if it is a
2820  * good idea.  This causes problems because we want to make sure COW
2821  * properly happens and the data=ordered rules are followed.
2822  *
2823  * In our case any range that doesn't have the ORDERED bit set
2824  * hasn't been properly setup for IO.  We kick off an async process
2825  * to fix it up.  The async helper will wait for ordered extents, set
2826  * the delalloc bit and make it safe to write the page.
2827  */
2828 int btrfs_writepage_cow_fixup(struct page *page)
2829 {
2830         struct inode *inode = page->mapping->host;
2831         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2832         struct btrfs_writepage_fixup *fixup;
2833
2834         /* This page has ordered extent covering it already */
2835         if (PageOrdered(page))
2836                 return 0;
2837
2838         /*
2839          * PageChecked is set below when we create a fixup worker for this page,
2840          * don't try to create another one if we're already PageChecked()
2841          *
2842          * The extent_io writepage code will redirty the page if we send back
2843          * EAGAIN.
2844          */
2845         if (PageChecked(page))
2846                 return -EAGAIN;
2847
2848         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2849         if (!fixup)
2850                 return -EAGAIN;
2851
2852         /*
2853          * We are already holding a reference to this inode from
2854          * write_cache_pages.  We need to hold it because the space reservation
2855          * takes place outside of the page lock, and we can't trust
2856          * page->mapping outside of the page lock.
2857          */
2858         ihold(inode);
2859         btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2860         get_page(page);
2861         btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2862         fixup->page = page;
2863         fixup->inode = inode;
2864         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2865
2866         return -EAGAIN;
2867 }
2868
2869 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2870                                        struct btrfs_inode *inode, u64 file_pos,
2871                                        struct btrfs_file_extent_item *stack_fi,
2872                                        const bool update_inode_bytes,
2873                                        u64 qgroup_reserved)
2874 {
2875         struct btrfs_root *root = inode->root;
2876         const u64 sectorsize = root->fs_info->sectorsize;
2877         struct btrfs_path *path;
2878         struct extent_buffer *leaf;
2879         struct btrfs_key ins;
2880         u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2881         u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2882         u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2883         u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2884         u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2885         struct btrfs_drop_extents_args drop_args = { 0 };
2886         int ret;
2887
2888         path = btrfs_alloc_path();
2889         if (!path)
2890                 return -ENOMEM;
2891
2892         /*
2893          * we may be replacing one extent in the tree with another.
2894          * The new extent is pinned in the extent map, and we don't want
2895          * to drop it from the cache until it is completely in the btree.
2896          *
2897          * So, tell btrfs_drop_extents to leave this extent in the cache.
2898          * the caller is expected to unpin it and allow it to be merged
2899          * with the others.
2900          */
2901         drop_args.path = path;
2902         drop_args.start = file_pos;
2903         drop_args.end = file_pos + num_bytes;
2904         drop_args.replace_extent = true;
2905         drop_args.extent_item_size = sizeof(*stack_fi);
2906         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2907         if (ret)
2908                 goto out;
2909
2910         if (!drop_args.extent_inserted) {
2911                 ins.objectid = btrfs_ino(inode);
2912                 ins.offset = file_pos;
2913                 ins.type = BTRFS_EXTENT_DATA_KEY;
2914
2915                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2916                                               sizeof(*stack_fi));
2917                 if (ret)
2918                         goto out;
2919         }
2920         leaf = path->nodes[0];
2921         btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2922         write_extent_buffer(leaf, stack_fi,
2923                         btrfs_item_ptr_offset(leaf, path->slots[0]),
2924                         sizeof(struct btrfs_file_extent_item));
2925
2926         btrfs_mark_buffer_dirty(leaf);
2927         btrfs_release_path(path);
2928
2929         /*
2930          * If we dropped an inline extent here, we know the range where it is
2931          * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2932          * number of bytes only for that range containing the inline extent.
2933          * The remaining of the range will be processed when clearning the
2934          * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2935          */
2936         if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2937                 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2938
2939                 inline_size = drop_args.bytes_found - inline_size;
2940                 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2941                 drop_args.bytes_found -= inline_size;
2942                 num_bytes -= sectorsize;
2943         }
2944
2945         if (update_inode_bytes)
2946                 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2947
2948         ins.objectid = disk_bytenr;
2949         ins.offset = disk_num_bytes;
2950         ins.type = BTRFS_EXTENT_ITEM_KEY;
2951
2952         ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2953         if (ret)
2954                 goto out;
2955
2956         ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2957                                                file_pos - offset,
2958                                                qgroup_reserved, &ins);
2959 out:
2960         btrfs_free_path(path);
2961
2962         return ret;
2963 }
2964
2965 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2966                                          u64 start, u64 len)
2967 {
2968         struct btrfs_block_group *cache;
2969
2970         cache = btrfs_lookup_block_group(fs_info, start);
2971         ASSERT(cache);
2972
2973         spin_lock(&cache->lock);
2974         cache->delalloc_bytes -= len;
2975         spin_unlock(&cache->lock);
2976
2977         btrfs_put_block_group(cache);
2978 }
2979
2980 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2981                                              struct btrfs_ordered_extent *oe)
2982 {
2983         struct btrfs_file_extent_item stack_fi;
2984         bool update_inode_bytes;
2985         u64 num_bytes = oe->num_bytes;
2986         u64 ram_bytes = oe->ram_bytes;
2987
2988         memset(&stack_fi, 0, sizeof(stack_fi));
2989         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2990         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2991         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2992                                                    oe->disk_num_bytes);
2993         btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
2994         if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2995                 num_bytes = ram_bytes = oe->truncated_len;
2996         btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
2997         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
2998         btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2999         /* Encryption and other encoding is reserved and all 0 */
3000
3001         /*
3002          * For delalloc, when completing an ordered extent we update the inode's
3003          * bytes when clearing the range in the inode's io tree, so pass false
3004          * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3005          * except if the ordered extent was truncated.
3006          */
3007         update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3008                              test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3009                              test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3010
3011         return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3012                                            oe->file_offset, &stack_fi,
3013                                            update_inode_bytes, oe->qgroup_rsv);
3014 }
3015
3016 /*
3017  * As ordered data IO finishes, this gets called so we can finish
3018  * an ordered extent if the range of bytes in the file it covers are
3019  * fully written.
      *
      * Depending on the flags of the ordered extent this either only updates
      * the inode (NOCOW), marks a preallocated extent as written (PREALLOC) or
      * inserts a new file extent item for the reserved extent.  On failure, or
      * when the ordered extent was truncated, the unwritten part of the range
      * is dropped from the extent cache and, where the conditions below allow
      * it, the reserved extent is freed again.
      *
      * The ordered extent is always removed from the inode and two references
      * on it are dropped before returning, regardless of the outcome.
      *
      * Return 0 on success, -EIO if BTRFS_ORDERED_IOERR was already set on the
      * ordered extent, or the error of the step that failed.
3020  */
3021 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
3022 {
3023         struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3024         struct btrfs_root *root = inode->root;
3025         struct btrfs_fs_info *fs_info = root->fs_info;
3026         struct btrfs_trans_handle *trans = NULL;
3027         struct extent_io_tree *io_tree = &inode->io_tree;
3028         struct extent_state *cached_state = NULL;
3029         u64 start, end;
3030         int compress_type = 0;
3031         int ret = 0;
3032         u64 logical_len = ordered_extent->num_bytes;
3033         bool freespace_inode;
3034         bool truncated = false;
3035         bool clear_reserved_extent = true;
3036         unsigned int clear_bits = EXTENT_DEFRAG;
3037
             /* File range covered by the ordered extent, @end is inclusive. */
3038         start = ordered_extent->file_offset;
3039         end = start + ordered_extent->num_bytes - 1;
3040
             /*
              * EXTENT_DELALLOC_NEW is only added to the bits cleared at 'out'
              * when none of the NOCOW, PREALLOC, DIRECT or ENCODED flags is set.
              */
3041         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3042             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3043             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3044             !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3045                 clear_bits |= EXTENT_DELALLOC_NEW;
3046
3047         freespace_inode = btrfs_is_free_space_inode(inode);
3048
             /* The IO already failed, go straight to the cleanup at 'out'. */
3049         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3050                 ret = -EIO;
3051                 goto out;
3052         }
3053
3054         /* A valid bdev implies a write on a sequential zone */
3055         if (ordered_extent->bdev) {
3056                 btrfs_rewrite_logical_zoned(ordered_extent);
3057                 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3058                                         ordered_extent->disk_num_bytes);
3059         }
3060
3061         btrfs_free_io_failure_record(inode, start, end);
3062
3063         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3064                 truncated = true;
3065                 logical_len = ordered_extent->truncated_len;
3066                 /* Truncated the entire extent, don't bother adding */
3067                 if (!logical_len)
3068                         goto out;
3069         }
3070
             /*
              * NOCOW: nothing is inserted into the tree here, only the inode is
              * updated inside a joined transaction.  The extent range is not
              * locked on this path, so EXTENT_LOCKED is not in clear_bits.
              */
3071         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3072                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3073
3074                 btrfs_inode_safe_disk_i_size_write(inode, 0);
3075                 if (freespace_inode)
3076                         trans = btrfs_join_transaction_spacecache(root);
3077                 else
3078                         trans = btrfs_join_transaction(root);
3079                 if (IS_ERR(trans)) {
3080                         ret = PTR_ERR(trans);
                             /* Keep 'out' from ending a handle we never got. */
3081                         trans = NULL;
3082                         goto out;
3083                 }
3084                 trans->block_rsv = &inode->block_rsv;
3085                 ret = btrfs_update_inode_fallback(trans, root, inode);
3086                 if (ret) /* -ENOMEM or corruption */
3087                         btrfs_abort_transaction(trans, ret);
3088                 goto out;
3089         }
3090
             /*
              * The range is locked from here on.  Recording EXTENT_LOCKED in
              * clear_bits makes the clear_extent_bit() call at 'out' clear that
              * bit as well on every exit path below.
              */
3091         clear_bits |= EXTENT_LOCKED;
3092         lock_extent_bits(io_tree, start, end, &cached_state);
3093
3094         if (freespace_inode)
3095                 trans = btrfs_join_transaction_spacecache(root);
3096         else
3097                 trans = btrfs_join_transaction(root);
3098         if (IS_ERR(trans)) {
3099                 ret = PTR_ERR(trans);
3100                 trans = NULL;
3101                 goto out;
3102         }
3103
3104         trans->block_rsv = &inode->block_rsv;
3105
3106         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3107                 compress_type = ordered_extent->compress_type;
3108         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3109                 BUG_ON(compress_type);
3110                 ret = btrfs_mark_extent_written(trans, inode,
3111                                                 ordered_extent->file_offset,
3112                                                 ordered_extent->file_offset +
3113                                                 logical_len);
3114         } else {
3115                 BUG_ON(root == fs_info->tree_root);
3116                 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3117                 if (!ret) {
                             /*
                              * The file extent item is in place, so the error
                              * handling below must no longer free the reserved
                              * extent.
                              */
3118                         clear_reserved_extent = false;
3119                         btrfs_release_delalloc_bytes(fs_info,
3120                                                 ordered_extent->disk_bytenr,
3121                                                 ordered_extent->disk_num_bytes);
3122                 }
3123         }
             /* Done for both outcomes; @ret is only checked afterwards. */
3124         unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3125                            ordered_extent->num_bytes, trans->transid);
3126         if (ret < 0) {
3127                 btrfs_abort_transaction(trans, ret);
3128                 goto out;
3129         }
3130
3131         ret = add_pending_csums(trans, &ordered_extent->list);
3132         if (ret) {
3133                 btrfs_abort_transaction(trans, ret);
3134                 goto out;
3135         }
3136
3137         /*
3138          * If this is a new delalloc range, clear its new delalloc flag to
3139          * update the inode's number of bytes. This needs to be done first
3140          * before updating the inode item.
3141          */
3142         if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3143             !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3144                 clear_extent_bit(&inode->io_tree, start, end,
3145                                  EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3146                                  0, 0, &cached_state);
3147
3148         btrfs_inode_safe_disk_i_size_write(inode, 0);
3149         ret = btrfs_update_inode_fallback(trans, root, inode);
3150         if (ret) { /* -ENOMEM or corruption */
3151                 btrfs_abort_transaction(trans, ret);
3152                 goto out;
3153         }
3154         ret = 0;
3155 out:
             /*
              * Common exit for success and for every failure above.  @trans may
              * be NULL here and clear_bits only contains EXTENT_LOCKED if the
              * range was actually locked.
              */
3156         clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3157                          (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
3158                          &cached_state);
3159
3160         if (trans)
3161                 btrfs_end_transaction(trans);
3162
3163         if (ret || truncated) {
3164                 u64 unwritten_start = start;
3165
3166                 /*
3167                  * If we failed to finish this ordered extent for any reason we
3168                  * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3169                  * extent, and mark the inode with the error if it wasn't
3170                  * already set.  Any error during writeback would have already
3171                  * set the mapping error, so we need to set it if we're the ones
3172                  * marking this ordered extent as failed.
3173                  */
3174                 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3175                                              &ordered_extent->flags))
3176                         mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3177
                     /* For a truncated extent skip the first logical_len bytes. */
3178                 if (truncated)
3179                         unwritten_start += logical_len;
3180                 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3181
3182                 /* Drop the cache for the part of the extent we didn't write. */
3183                 btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
3184
3185                 /*
3186                  * If the ordered extent had an IOERR or something else went
3187                  * wrong we need to return the space for this ordered extent
3188                  * back to the allocator.  We only free the extent in the
3189                  * truncated case if we didn't write out the extent at all.
3190                  *
3191                  * If we made it past insert_reserved_file_extent before we
3192                  * errored out then we don't need to do this as the accounting
3193                  * has already been done.
3194                  */
3195                 if ((ret || !logical_len) &&
3196                     clear_reserved_extent &&
3197                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3198                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3199                         /*
3200                          * Discard the range before returning it back to the
3201                          * free space pool
3202                          */
3203                         if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3204                                 btrfs_discard_extent(fs_info,
3205                                                 ordered_extent->disk_bytenr,
3206                                                 ordered_extent->disk_num_bytes,
3207                                                 NULL);
3208                         btrfs_free_reserved_extent(fs_info,
3209                                         ordered_extent->disk_bytenr,
3210                                         ordered_extent->disk_num_bytes, 1);
3211                 }
3212         }
3213
3214         /*
3215          * This needs to be done to make sure anybody waiting knows we are done
3216          * updating everything for this ordered extent.
3217          */
3218         btrfs_remove_ordered_extent(inode, ordered_extent);
3219
3220         /* once for us */
3221         btrfs_put_ordered_extent(ordered_extent);
3222         /* once for the tree */
3223         btrfs_put_ordered_extent(ordered_extent);
3224
3225         return ret;
3226 }
3227
3228 static void finish_ordered_fn(struct btrfs_work *work)
3229 {
3230         struct btrfs_ordered_extent *ordered_extent;
3231         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3232         btrfs_finish_ordered_io(ordered_extent);
3233 }
3234
3235 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
3236                                           struct page *page, u64 start,
3237                                           u64 end, bool uptodate)
3238 {
3239         trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
3240
3241         btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
3242                                        finish_ordered_fn, uptodate);
3243 }
3244
3245 /*
3246  * check_data_csum - verify checksum of one sector of uncompressed data
3247  * @inode:      inode
3248  * @io_bio:     btrfs_io_bio which contains the csum
3249  * @bio_offset: offset to the beginning of the bio (in bytes)
3250  * @page:       page where is the data to be verified
3251  * @pgoff:      offset inside the page
3252  * @start:      logical offset in the file
3253  *
3254  * The length of such check is always one sector size.
3255  */
3256 static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
3257                            u32 bio_offset, struct page *page, u32 pgoff,
3258                            u64 start)
3259 {
3260         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3261         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3262         char *kaddr;
3263         u32 len = fs_info->sectorsize;
3264         const u32 csum_size = fs_info->csum_size;
3265         unsigned int offset_sectors;
3266         u8 *csum_expected;
3267         u8 csum[BTRFS_CSUM_SIZE];
3268
3269         ASSERT(pgoff + len <= PAGE_SIZE);
3270
3271         offset_sectors = bio_offset >> fs_info->sectorsize_bits;
3272         csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
3273
3274         kaddr = kmap_atomic(page);
3275         shash->tfm = fs_info->csum_shash;
3276
3277         crypto_shash_digest(shash, kaddr + pgoff, len, csum);
3278
3279         if (memcmp(csum, csum_expected, csum_size))
3280                 goto zeroit;
3281
3282         kunmap_atomic(kaddr);
3283         return 0;
3284 zeroit:
3285         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3286                                     bbio->mirror_num);
3287         if (bbio->device)
3288                 btrfs_dev_stat_inc_and_print(bbio->device,
3289                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
3290         memset(kaddr + pgoff, 1, len);
3291         flush_dcache_page(page);
3292         kunmap_atomic(kaddr);
3293         return -EIO;
3294 }
3295
3296 /*
3297  * When reads are done, we need to check csums to verify the data is correct.
3298  * if there's a match, we allow the bio to finish.  If not, the code in
3299  * extent_io.c will try to find good copies for us.
3300  *
3301  * @bio_offset: offset to the beginning of the bio (in bytes)
3302  * @start:      file offset of the range start
3303  * @end:        file offset of the range end (inclusive)
3304  *
3305  * Return a bitmap where bit set means a csum mismatch, and bit not set means
3306  * csum match.
3307  */
3308 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
3309                                     u32 bio_offset, struct page *page,
3310                                     u64 start, u64 end)
3311 {
3312         struct inode *inode = page->mapping->host;
3313         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3314         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3315         struct btrfs_root *root = BTRFS_I(inode)->root;
3316         const u32 sectorsize = root->fs_info->sectorsize;
3317         u32 pg_off;
3318         unsigned int result = 0;
3319
3320         if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
3321                 btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
3322                 return 0;
3323         }
3324
3325         /*
3326          * This only happens for NODATASUM or compressed read.
3327          * Normally this should be covered by above check for compressed read
3328          * or the next check for NODATASUM.  Just do a quicker exit here.
3329          */
3330         if (bbio->csum == NULL)
3331                 return 0;
3332
3333         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3334                 return 0;
3335
3336         if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
3337                 return 0;
3338
3339         ASSERT(page_offset(page) <= start &&
3340                end <= page_offset(page) + PAGE_SIZE - 1);
3341         for (pg_off = offset_in_page(start);
3342              pg_off < offset_in_page(end);
3343              pg_off += sectorsize, bio_offset += sectorsize) {
3344                 u64 file_offset = pg_off + page_offset(page);
3345                 int ret;
3346
3347                 if (btrfs_is_data_reloc_root(root) &&
3348                     test_range_bit(io_tree, file_offset,
3349                                    file_offset + sectorsize - 1,
3350                                    EXTENT_NODATASUM, 1, NULL)) {
3351                         /* Skip the range without csum for data reloc inode */
3352                         clear_extent_bits(io_tree, file_offset,
3353                                           file_offset + sectorsize - 1,
3354                                           EXTENT_NODATASUM);
3355                         continue;
3356                 }
3357                 ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
3358                                       page_offset(page) + pg_off);
3359                 if (ret < 0) {
3360                         const int nr_bit = (pg_off - offset_in_page(start)) >>
3361                                      root->fs_info->sectorsize_bits;
3362
3363                         result |= (1U << nr_bit);
3364                 }
3365         }
3366         return result;
3367 }
3368
3369 /*
3370  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3371  *
3372  * @inode: The inode we want to perform iput on
3373  *
3374  * This function uses the generic vfs_inode::i_count to track whether we should
3375  * just decrement it (in case it's > 1) or if this is the last iput then link
3376  * the inode to the delayed iput machinery. Delayed iputs are processed at
3377  * transaction commit time/superblock commit/cleaner kthread.
3378  */
3379 void btrfs_add_delayed_iput(struct inode *inode)
3380 {
3381         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3382         struct btrfs_inode *binode = BTRFS_I(inode);
3383
3384         if (atomic_add_unless(&inode->i_count, -1, 1))
3385                 return;
3386
3387         atomic_inc(&fs_info->nr_delayed_iputs);
3388         spin_lock(&fs_info->delayed_iput_lock);
3389         ASSERT(list_empty(&binode->delayed_iput));
3390         list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3391         spin_unlock(&fs_info->delayed_iput_lock);
3392         if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3393                 wake_up_process(fs_info->cleaner_kthread);
3394 }
3395
3396 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3397                                     struct btrfs_inode *inode)
3398 {
3399         list_del_init(&inode->delayed_iput);
3400         spin_unlock(&fs_info->delayed_iput_lock);
3401         iput(&inode->vfs_inode);
3402         if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3403                 wake_up(&fs_info->delayed_iputs_wait);
3404         spin_lock(&fs_info->delayed_iput_lock);
3405 }
3406
3407 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3408                                    struct btrfs_inode *inode)
3409 {
3410         if (!list_empty(&inode->delayed_iput)) {
3411                 spin_lock(&fs_info->delayed_iput_lock);
3412                 if (!list_empty(&inode->delayed_iput))
3413                         run_delayed_iput_locked(fs_info, inode);
3414                 spin_unlock(&fs_info->delayed_iput_lock);
3415         }
3416 }
3417
3418 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3419 {
3420
3421         spin_lock(&fs_info->delayed_iput_lock);
3422         while (!list_empty(&fs_info->delayed_iputs)) {
3423                 struct btrfs_inode *inode;
3424
3425                 inode = list_first_entry(&fs_info->delayed_iputs,
3426                                 struct btrfs_inode, delayed_iput);
3427                 run_delayed_iput_locked(fs_info, inode);
3428                 cond_resched_lock(&fs_info->delayed_iput_lock);
3429         }
3430         spin_unlock(&fs_info->delayed_iput_lock);
3431 }
3432
3433 /**
3434  * Wait for flushing all delayed iputs
3435  *
3436  * @fs_info:  the filesystem
3437  *
3438  * This will wait on any delayed iputs that are currently running with KILLABLE
3439  * set.  Once they are all done running we will return, unless we are killed in
3440  * which case we return EINTR. This helps in user operations like fallocate etc
3441  * that might get blocked on the iputs.
3442  *
3443  * Return EINTR if we were killed, 0 if nothing's pending
3444  */
3445 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3446 {
3447         int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3448                         atomic_read(&fs_info->nr_delayed_iputs) == 0);
3449         if (ret)
3450                 return -EINTR;
3451         return 0;
3452 }
3453
3454 /*
3455  * This creates an orphan entry for the given inode in case something goes wrong
3456  * in the middle of an unlink.
3457  */
3458 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3459                      struct btrfs_inode *inode)
3460 {
3461         int ret;
3462
3463         ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3464         if (ret && ret != -EEXIST) {
3465                 btrfs_abort_transaction(trans, ret);
3466                 return ret;
3467         }
3468
3469         return 0;
3470 }
3471
3472 /*
3473  * We have done the delete so we can go ahead and remove the orphan item for
3474  * this particular inode.
3475  */
3476 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3477                             struct btrfs_inode *inode)
3478 {
3479         return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3480 }
3481
/*
 * Clean up any orphan items that may be left in this root from its last use.
 *
 * The orphan items of @root are visited by repeatedly searching for the key
 * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, offset) and stepping back
 * one slot when there is no exact match.  The offset of each orphan item key
 * is the number of the inode it refers to.  For every item found:
 *
 *  - in the tree root, an item whose inode lookup gives -ENOENT and whose
 *    number matches a root with zero refs in fs_roots_radix is skipped;
 *  - if the inode does not exist (-ENOENT) or still has links, the orphan
 *    item is deleted in a transaction of its own (verity items are dropped
 *    first when the inode exists);
 *  - otherwise the inode reference is dropped with iput() and the item is
 *    counted in nr_unlink.
 *
 * Only one caller per root does the work: if BTRFS_ROOT_ORPHAN_CLEANUP is
 * already set in root->state, 0 is returned immediately.
 *
 * Returns 0 on success, otherwise the error that stopped the cleanup (which
 * is also reported through btrfs_err()).
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key, found_key;
        struct btrfs_trans_handle *trans;
        struct inode *inode;
        u64 last_objectid = 0;
        int ret = 0, nr_unlink = 0;

        /* Somebody already started (or finished) the cleanup for this root. */
        if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
                return 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
        path->reada = READA_BACK;

        /* Start the search at the largest possible orphan item offset. */
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;

                /*
                 * If ret == 0 we found exactly the key we searched for, which
                 * is weird but possible.  Only adjust the path when the key
                 * was not found: step back one slot and check whether that
                 * item matches.
                 */
                if (ret > 0) {
                        ret = 0;
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                /* Pull out the key of the item we are positioned on. */
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Make sure the item is an orphan item, otherwise we are done. */
                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                        break;
                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                /* Release the path since we are done with it. */
                btrfs_release_path(path);

                /*
                 * This is where we are basically btrfs_lookup, without the
                 * crossing root thing.  The inode number is stored in the
                 * offset of the orphan item.
                 */

                /*
                 * Finding the same inode number twice in a row means the
                 * previous iteration did not get rid of its orphan item.
                 * Stop instead of looping on it.
                 */
                if (found_key.offset == last_objectid) {
                        btrfs_err(fs_info,
                                  "Error removing orphan entry, stopping orphan cleanup");
                        ret = -EINVAL;
                        goto out;
                }

                last_objectid = found_key.offset;

                /* From here on found_key.objectid holds the inode number. */
                found_key.objectid = found_key.offset;
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(fs_info->sb, last_objectid, root);
                ret = PTR_ERR_OR_ZERO(inode);
                /* -ENOENT is handled below, every other error is fatal. */
                if (ret && ret != -ENOENT)
                        goto out;

                if (ret == -ENOENT && root == fs_info->tree_root) {
                        struct btrfs_root *dead_root;
                        int is_dead_root = 0;

                        /*
                         * This is an orphan in the tree root. Currently these
                         * could come from 2 sources:
                         *  a) a root (snapshot/subvolume) deletion in progress
                         *  b) a free space cache inode
                         * We need to distinguish those two, as the orphan item
                         * for a root must not get deleted before the deletion
                         * of the snapshot/subvolume's tree completes.
                         *
                         * btrfs_find_orphan_roots() ran before us, which has
                         * found all deleted roots and loaded them into
                         * fs_info->fs_roots_radix. So here we can find if an
                         * orphan item corresponds to a deleted root by looking
                         * up the root from that radix tree.
                         */

                        spin_lock(&fs_info->fs_roots_radix_lock);
                        dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
                                                         (unsigned long)found_key.objectid);
                        if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
                                is_dead_root = 1;
                        spin_unlock(&fs_info->fs_roots_radix_lock);

                        if (is_dead_root) {
                                /* Prevent this orphan from being found again. */
                                key.offset = found_key.objectid - 1;
                                continue;
                        }

                }

                /*
                 * If we have an inode with links, there are a couple of
                 * possibilities:
                 *
                 * 1. We were halfway through creating fsverity metadata for the
                 * file. In that case, the orphan item represents incomplete
                 * fsverity metadata which must be cleaned up with
                 * btrfs_drop_verity_items and deleting the orphan item.
                 *
                 * 2. Old kernels (before v3.12) used to create an
                 * orphan item for truncate indicating that there were possibly
                 * extent items past i_size that needed to be deleted. In v3.12,
                 * truncate was changed to update i_size in sync with the extent
                 * items, but the (useless) orphan item was still created. Since
                 * v4.18, we don't create the orphan item for truncate at all.
                 *
                 * So, this item could mean that we need to do a truncate, but
                 * only if this filesystem was last used on a pre-v3.12 kernel
                 * and was not cleanly unmounted. The odds of that are quite
                 * slim, and it's a pain to do the truncate now, so just delete
                 * the orphan item.
                 *
                 * It's also possible that this orphan item was supposed to be
                 * deleted but wasn't. The inode number may have been reused,
                 * but either way, we can delete the orphan item.
                 */
                if (ret == -ENOENT || inode->i_nlink) {
                        /* ret == 0 here means btrfs_iget() gave us an inode. */
                        if (!ret) {
                                ret = btrfs_drop_verity_items(BTRFS_I(inode));
                                iput(inode);
                                if (ret)
                                        goto out;
                        }
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
                                goto out;
                        }
                        btrfs_debug(fs_info, "auto deleting %Lu",
                                    found_key.objectid);
                        ret = btrfs_del_orphan_item(trans, root,
                                                    found_key.objectid);
                        btrfs_end_transaction(trans);
                        if (ret)
                                goto out;
                        continue;
                }

                nr_unlink++;

                /* This will do delete_inode and everything for us. */
                iput(inode);
        }
        /* Release the path since we are done with it. */
        btrfs_release_path(path);

        /*
         * NOTE(review): when an orphan item was inserted for this root, a
         * transaction is joined and ended right away; errors from the join
         * are ignored.  The reason is not visible from this function.
         */
        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans);
        }

        if (nr_unlink)
                btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);

out:
        if (ret)
                btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
        btrfs_free_path(path);
        return ret;
}
3670
3671 /*
3672  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3673  * don't find any xattrs, we know there can't be any acls.
3674  *
3675  * slot is the slot the inode is in, objectid is the objectid of the inode
3676  */
3677 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3678                                           int slot, u64 objectid,
3679                                           int *first_xattr_slot)
3680 {
3681         u32 nritems = btrfs_header_nritems(leaf);
3682         struct btrfs_key found_key;
3683         static u64 xattr_access = 0;
3684         static u64 xattr_default = 0;
3685         int scanned = 0;
3686
3687         if (!xattr_access) {
3688                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3689                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3690                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3691                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3692         }
3693
3694         slot++;
3695         *first_xattr_slot = -1;
3696         while (slot < nritems) {
3697                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3698
3699                 /* we found a different objectid, there must not be acls */
3700                 if (found_key.objectid != objectid)
3701                         return 0;
3702
3703                 /* we found an xattr, assume we've got an acl */
3704                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3705                         if (*first_xattr_slot == -1)
3706                                 *first_xattr_slot = slot;
3707                         if (found_key.offset == xattr_access ||
3708                             found_key.offset == xattr_default)
3709                                 return 1;
3710                 }
3711
3712                 /*
3713                  * we found a key greater than an xattr key, there can't
3714                  * be any acls later on
3715                  */
3716                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3717                         return 0;
3718
3719                 slot++;
3720                 scanned++;
3721
3722                 /*
3723                  * it goes inode, inode backrefs, xattrs, extents,
3724                  * so if there are a ton of hard links to an inode there can
3725                  * be a lot of backrefs.  Don't waste time searching too hard,
3726                  * this is just an optimization
3727                  */
3728                 if (scanned >= 8)
3729                         break;
3730         }
3731         /* we hit the end of the leaf before we found an xattr or
3732          * something larger than an xattr.  We have to assume the inode
3733          * has acls
3734          */
3735         if (*first_xattr_slot == -1)
3736                 *first_xattr_slot = slot;
3737         return 1;
3738 }
3739
/*
 * Read an inode from the btree into the in-memory inode.
 *
 * @inode:   in-memory inode to populate; its root and location are taken
 *           from BTRFS_I(inode)
 * @in_path: optional path to use for the lookup.  When NULL a path is
 *           allocated here and freed before returning.  A caller supplied
 *           path is never freed here.
 *
 * Besides copying the on-disk fields this also caches the directory index
 * for single-link inodes, loads the inode properties, precaches "no ACL"
 * when the leaf shows there cannot be any, and installs the operations that
 * match the file type.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * non-zero result of btrfs_lookup_inode().  A failure to load the inode
 * properties is only logged.
 */
static int btrfs_read_locked_inode(struct inode *inode,
                                   struct btrfs_path *in_path)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_path *path = in_path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        unsigned long ptr;
        int maybe_acls;
        u32 rdev;
        int ret;
        bool filled = false;
        int first_xattr_slot;

        /*
         * If btrfs_fill_inode() succeeds, the copy from the on-disk inode
         * item below is skipped.
         * NOTE(review): presumably this fills the inode (and rdev) from the
         * delayed inode, going by the cache_index comment - confirm.
         */
        ret = btrfs_fill_inode(inode, &rdev);
        if (!ret)
                filled = true;

        if (!path) {
                path = btrfs_alloc_path();
                if (!path)
                        return -ENOMEM;
        }

        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));

        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
        if (ret) {
                /* Only free the path if we allocated it ourselves. */
                if (path != in_path)
                        btrfs_free_path(path);
                return ret;
        }

        leaf = path->nodes[0];

        if (filled)
                goto cache_index;

        /* Copy the fields of the on-disk inode item into the VFS inode. */
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
        btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
        btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
                        round_up(i_size_read(inode), fs_info->sectorsize));

        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);

        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);

        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);

        BTRFS_I(inode)->i_otime.tv_sec =
                btrfs_timespec_sec(leaf, &inode_item->otime);
        BTRFS_I(inode)->i_otime.tv_nsec =
                btrfs_timespec_nsec(leaf, &inode_item->otime);

        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

        inode_set_iversion_queried(inode,
                                   btrfs_inode_sequence(leaf, inode_item));
        inode->i_generation = BTRFS_I(inode)->generation;
        /*
         * i_rdev is cleared here; the on-disk rdev is only handed to
         * init_special_inode() further down, for special files.
         */
        inode->i_rdev = 0;
        rdev = btrfs_inode_rdev(leaf, inode_item);

        BTRFS_I(inode)->index_cnt = (u64)-1;
        btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
                                &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
        /*
         * If we were modified in the current generation and evicted from memory
         * and then re-read we need to do a full sync since we don't have any
         * idea about which extents were modified before we were evicted from
         * cache.
         *
         * This is required for both inode re-read from disk and delayed inode
         * in delayed_nodes_tree.
         */
        if (BTRFS_I(inode)->last_trans == fs_info->generation)
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);

        /*
         * We don't persist the id of the transaction where an unlink operation
         * against the inode was last made. So here we assume the inode might
         * have been evicted, and therefore the exact value of last_unlink_trans
         * lost, and set it to last_trans to avoid metadata inconsistencies
         * between the inode and its parent if the inode is fsync'ed and the log
         * replayed. For example, in the scenario:
         *
         * touch mydir/foo
         * ln mydir/foo mydir/bar
         * sync
         * unlink mydir/bar
         * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
         * xfs_io -c fsync mydir/foo
         * <power failure>
         * mount fs, triggers fsync log replay
         *
         * We must make sure that when we fsync our inode foo we also log its
         * parent inode, otherwise after log replay the parent still has the
         * dentry with the "bar" name but our inode foo has a link count of 1
         * and doesn't have an inode ref with the name "bar" anymore.
         *
         * Setting last_unlink_trans to last_trans is a pessimistic approach,
         * but it guarantees correctness at the expense of occasional full
         * transaction commits on fsync if our inode is a directory, or if our
         * inode is not a directory, logging its parent unnecessarily.
         */
        BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

        /*
         * Same logic as for last_unlink_trans. We don't persist the generation
         * of the last transaction where this inode was used for a reflink
         * operation, so after eviction and reloading the inode we must be
         * pessimistic and assume the last transaction that modified the inode.
         */
        BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

        /*
         * Peek at the item right after the inode item.  For an inode with
         * exactly one link, if that item is an inode ref or extref of this
         * inode, cache the directory index stored in it.
         */
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
                goto cache_acl;

        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
        if (location.objectid != btrfs_ino(BTRFS_I(inode)))
                goto cache_acl;

        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
        if (location.type == BTRFS_INODE_REF_KEY) {
                struct btrfs_inode_ref *ref;

                ref = (struct btrfs_inode_ref *)ptr;
                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
                struct btrfs_inode_extref *extref;

                extref = (struct btrfs_inode_extref *)ptr;
                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
                                                                     extref);
        }
cache_acl:
        /*
         * Try to precache a NULL acl entry for files that don't have
         * any xattrs or acls.
         */
        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
                        btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
        if (first_xattr_slot != -1) {
                /* Start loading the properties at the reported slot. */
                path->slots[0] = first_xattr_slot;
                ret = btrfs_load_inode_props(inode, path);
                /* Not fatal: the error is logged and we carry on. */
                if (ret)
                        btrfs_err(fs_info,
                                  "error loading props for ino %llu (root %llu): %d",
                                  btrfs_ino(BTRFS_I(inode)),
                                  root->root_key.objectid, ret);
        }
        if (path != in_path)
                btrfs_free_path(path);

        if (!maybe_acls)
                cache_no_acl(inode);

        /* Install the operations that match the file type. */
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &btrfs_aops;
                inode->i_fop = &btrfs_file_operations;
                inode->i_op = &btrfs_file_inode_operations;
                break;
        case S_IFDIR:
                inode->i_fop = &btrfs_dir_file_operations;
                inode->i_op = &btrfs_dir_inode_operations;
                break;
        case S_IFLNK:
                inode->i_op = &btrfs_symlink_inode_operations;
                inode_nohighmem(inode);
                inode->i_mapping->a_ops = &btrfs_aops;
                break;
        default:
                inode->i_op = &btrfs_special_inode_operations;
                init_special_inode(inode, inode->i_mode, rdev);
                break;
        }

        btrfs_sync_inode_flags_to_i_flags(inode);
        return 0;
}
3940
3941 /*
3942  * given a leaf and an inode, copy the inode fields into the leaf
3943  */
3944 static void fill_inode_item(struct btrfs_trans_handle *trans,
3945                             struct extent_buffer *leaf,
3946                             struct btrfs_inode_item *item,
3947                             struct inode *inode)
3948 {
3949         struct btrfs_map_token token;
3950         u64 flags;
3951
3952         btrfs_init_map_token(&token, leaf);
3953
3954         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3955         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3956         btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3957         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3958         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3959
3960         btrfs_set_token_timespec_sec(&token, &item->atime,
3961                                      inode->i_atime.tv_sec);
3962         btrfs_set_token_timespec_nsec(&token, &item->atime,
3963                                       inode->i_atime.tv_nsec);
3964
3965         btrfs_set_token_timespec_sec(&token, &item->mtime,
3966                                      inode->i_mtime.tv_sec);
3967         btrfs_set_token_timespec_nsec(&token, &item->mtime,
3968                                       inode->i_mtime.tv_nsec);
3969
3970         btrfs_set_token_timespec_sec(&token, &item->ctime,
3971                                      inode->i_ctime.tv_sec);
3972         btrfs_set_token_timespec_nsec(&token, &item->ctime,
3973                                       inode->i_ctime.tv_nsec);
3974
3975         btrfs_set_token_timespec_sec(&token, &item->otime,
3976                                      BTRFS_I(inode)->i_otime.tv_sec);
3977         btrfs_set_token_timespec_nsec(&token, &item->otime,
3978                                       BTRFS_I(inode)->i_otime.tv_nsec);
3979
3980         btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3981         btrfs_set_token_inode_generation(&token, item,
3982                                          BTRFS_I(inode)->generation);
3983         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3984         btrfs_set_token_inode_transid(&token, item, trans->transid);
3985         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3986         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3987                                           BTRFS_I(inode)->ro_flags);
3988         btrfs_set_token_inode_flags(&token, item, flags);
3989         btrfs_set_token_inode_block_group(&token, item, 0);
3990 }
3991
3992 /*
3993  * copy everything in the in-memory inode into the btree.
3994  */
3995 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3996                                 struct btrfs_root *root,
3997                                 struct btrfs_inode *inode)
3998 {
3999         struct btrfs_inode_item *inode_item;
4000         struct btrfs_path *path;
4001         struct extent_buffer *leaf;
4002         int ret;
4003
4004         path = btrfs_alloc_path();
4005         if (!path)
4006                 return -ENOMEM;
4007
4008         ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
4009         if (ret) {
4010                 if (ret > 0)
4011                         ret = -ENOENT;
4012                 goto failed;
4013         }
4014
4015         leaf = path->nodes[0];
4016         inode_item = btrfs_item_ptr(leaf, path->slots[0],
4017                                     struct btrfs_inode_item);
4018
4019         fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4020         btrfs_mark_buffer_dirty(leaf);
4021         btrfs_set_inode_last_trans(trans, inode);
4022         ret = 0;
4023 failed:
4024         btrfs_free_path(path);
4025         return ret;
4026 }
4027
4028 /*
4029  * copy everything in the in-memory inode into the btree.
4030  */
4031 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4032                                 struct btrfs_root *root,
4033                                 struct btrfs_inode *inode)
4034 {
4035         struct btrfs_fs_info *fs_info = root->fs_info;
4036         int ret;
4037
4038         /*
4039          * If the inode is a free space inode, we can deadlock during commit
4040          * if we put it into the delayed code.
4041          *
4042          * The data relocation inode should also be directly updated
4043          * without delay
4044          */
4045         if (!btrfs_is_free_space_inode(inode)
4046             && !btrfs_is_data_reloc_root(root)
4047             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4048                 btrfs_update_root_times(trans, root);
4049
4050                 ret = btrfs_delayed_update_inode(trans, root, inode);
4051                 if (!ret)
4052                         btrfs_set_inode_last_trans(trans, inode);
4053                 return ret;
4054         }
4055
4056         return btrfs_update_inode_item(trans, root, inode);
4057 }
4058
4059 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4060                                 struct btrfs_root *root, struct btrfs_inode *inode)
4061 {
4062         int ret;
4063
4064         ret = btrfs_update_inode(trans, root, inode);
4065         if (ret == -ENOSPC)
4066                 return btrfs_update_inode_item(trans, root, inode);
4067         return ret;
4068 }
4069
/*
 * Unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It removes a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory.
 *
 * @trans:      transaction handle the changes are made in
 * @dir:        directory that holds the entry
 * @inode:      inode the entry refers to
 * @name:       name of the entry
 * @name_len:   length of @name
 * @rename_ctx: optional.  When non-NULL, the index of the removed directory
 *              entry is stored in rename_ctx->index and the log tree updates
 *              are skipped.
 *
 * On success the directory's i_size is reduced by name_len * 2, the iversion
 * of both inodes is incremented, their timestamps are updated and the
 * directory inode is written back with btrfs_update_inode().  The link count
 * of @inode is not touched here.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * directory item does not exist, or the error of the step that failed.
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *dir,
                                struct btrfs_inode *inode,
                                const char *name, int name_len,
                                struct btrfs_rename_ctx *rename_ctx)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_dir_item *di;
        u64 index;
        u64 ino = btrfs_ino(inode);
        u64 dir_ino = btrfs_ino(dir);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* Find the directory item for the name and delete it. */
        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                    name, name_len, -1);
        if (IS_ERR_OR_NULL(di)) {
                ret = di ? PTR_ERR(di) : -ENOENT;
                goto err;
        }
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        if (ret)
                goto err;
        btrfs_release_path(path);

        /*
         * If we don't have the dir index, we have to get it by looking up
         * the inode ref.  Since we then have the inode ref at hand we remove
         * it directly, a delayed deletion is unnecessary.
         *
         * But if we do have the dir index, there is no need to search for the
         * inode ref to get it.  Since the inode ref is close to the inode
         * item, it is better to delay its deletion and do it when we update
         * the inode item.
         */
        if (inode->dir_index) {
                ret = btrfs_delayed_delete_inode_ref(inode);
                if (!ret) {
                        index = inode->dir_index;
                        goto skip_backref;
                }
                /* On failure fall through to the direct deletion below. */
        }

        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
                                  dir_ino, &index);
        if (ret) {
                btrfs_info(fs_info,
                        "failed to delete reference to %.*s, inode %llu parent %llu",
                        name_len, name, ino, dir_ino);
                btrfs_abort_transaction(trans, ret);
                goto err;
        }
skip_backref:
        /* Hand the index of the removed entry back to a rename caller. */
        if (rename_ctx)
                rename_ctx->index = index;

        ret = btrfs_delete_delayed_dir_index(trans, dir, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto err;
        }

        /*
         * If we are in a rename context, we don't need to update anything in the
         * log. That will be done later during the rename by btrfs_log_new_name().
         * Besides that, doing it here would only cause extra unnecessary btree
         * operations on the log tree, increasing latency for applications.
         */
        if (!rename_ctx) {
                btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
                                           dir_ino);
                btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
                                             index);
        }

        /*
         * If we have a pending delayed iput we could end up with the final iput
         * being run in btrfs-cleaner context.  If we have enough of these built
         * up we can end up burning a lot of time in btrfs-cleaner without any
         * way to throttle the unlinks.  Since we're currently holding a ref on
         * the inode we can run the delayed iput here without any issues as the
         * final iput won't be done until after we drop the ref we're currently
         * holding.
         */
        btrfs_run_delayed_iput(fs_info, inode);
err:
        /* Reached on success as well: the path is no longer needed. */
        btrfs_free_path(path);
        if (ret)
                goto out;

        /* Account for the removed entry in the directory inode. */
        btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
        inode_inc_iversion(&inode->vfs_inode);
        inode_inc_iversion(&dir->vfs_inode);
        inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
                dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
        ret = btrfs_update_inode(trans, root, dir);
out:
        return ret;
}
4181
4182 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4183                        struct btrfs_inode *dir, struct btrfs_inode *inode,
4184                        const char *name, int name_len)
4185 {
4186         int ret;
4187         ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
4188         if (!ret) {
4189                 drop_nlink(&inode->vfs_inode);
4190                 ret = btrfs_update_inode(trans, inode->root, inode);
4191         }
4192         return ret;
4193 }
4194
4195 /*
4196  * helper to start transaction for unlink and rmdir.
4197  *
4198  * unlink and rmdir are special in btrfs, they do not always free space, so
4199  * if we cannot make our reservations the normal way try and see if there is
4200  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4201  * allow the unlink to occur.
4202  */
4203 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4204 {
4205         struct btrfs_root *root = BTRFS_I(dir)->root;
4206
4207         /*
4208          * 1 for the possible orphan item
4209          * 1 for the dir item
4210          * 1 for the dir index
4211          * 1 for the inode ref
4212          * 1 for the inode
4213          */
4214         return btrfs_start_transaction_fallback_global_rsv(root, 5);
4215 }
4216
4217 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4218 {
4219         struct btrfs_trans_handle *trans;
4220         struct inode *inode = d_inode(dentry);
4221         int ret;
4222
4223         trans = __unlink_start_trans(dir);
4224         if (IS_ERR(trans))
4225                 return PTR_ERR(trans);
4226
4227         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4228                         0);
4229
4230         ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
4231                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4232                         dentry->d_name.len);
4233         if (ret)
4234                 goto out;
4235
4236         if (inode->i_nlink == 0) {
4237                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4238                 if (ret)
4239                         goto out;
4240         }
4241
4242 out:
4243         btrfs_end_transaction(trans);
4244         btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4245         return ret;
4246 }
4247
4248 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4249                                struct inode *dir, struct dentry *dentry)
4250 {
4251         struct btrfs_root *root = BTRFS_I(dir)->root;
4252         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4253         struct btrfs_path *path;
4254         struct extent_buffer *leaf;
4255         struct btrfs_dir_item *di;
4256         struct btrfs_key key;
4257         const char *name = dentry->d_name.name;
4258         int name_len = dentry->d_name.len;
4259         u64 index;
4260         int ret;
4261         u64 objectid;
4262         u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4263
4264         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4265                 objectid = inode->root->root_key.objectid;
4266         } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4267                 objectid = inode->location.objectid;
4268         } else {
4269                 WARN_ON(1);
4270                 return -EINVAL;
4271         }
4272
4273         path = btrfs_alloc_path();
4274         if (!path)
4275                 return -ENOMEM;
4276
4277         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4278                                    name, name_len, -1);
4279         if (IS_ERR_OR_NULL(di)) {
4280                 ret = di ? PTR_ERR(di) : -ENOENT;
4281                 goto out;
4282         }
4283
4284         leaf = path->nodes[0];
4285         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4286         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4287         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4288         if (ret) {
4289                 btrfs_abort_transaction(trans, ret);
4290                 goto out;
4291         }
4292         btrfs_release_path(path);
4293
4294         /*
4295          * This is a placeholder inode for a subvolume we didn't have a
4296          * reference to at the time of the snapshot creation.  In the meantime
4297          * we could have renamed the real subvol link into our snapshot, so
4298          * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4299          * Instead simply lookup the dir_index_item for this entry so we can
4300          * remove it.  Otherwise we know we have a ref to the root and we can
4301          * call btrfs_del_root_ref, and it _shouldn't_ fail.
4302          */
4303         if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4304                 di = btrfs_search_dir_index_item(root, path, dir_ino,
4305                                                  name, name_len);
4306                 if (IS_ERR_OR_NULL(di)) {
4307                         if (!di)
4308                                 ret = -ENOENT;
4309                         else
4310                                 ret = PTR_ERR(di);
4311                         btrfs_abort_transaction(trans, ret);
4312                         goto out;
4313                 }
4314
4315                 leaf = path->nodes[0];
4316                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4317                 index = key.offset;
4318                 btrfs_release_path(path);
4319         } else {
4320                 ret = btrfs_del_root_ref(trans, objectid,
4321                                          root->root_key.objectid, dir_ino,
4322                                          &index, name, name_len);
4323                 if (ret) {
4324                         btrfs_abort_transaction(trans, ret);
4325                         goto out;
4326                 }
4327         }
4328
4329         ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4330         if (ret) {
4331                 btrfs_abort_transaction(trans, ret);
4332                 goto out;
4333         }
4334
4335         btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4336         inode_inc_iversion(dir);
4337         dir->i_mtime = dir->i_ctime = current_time(dir);
4338         ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
4339         if (ret)
4340                 btrfs_abort_transaction(trans, ret);
4341 out:
4342         btrfs_free_path(path);
4343         return ret;
4344 }
4345
4346 /*
4347  * Helper to check if the subvolume references other subvolumes or if it's
4348  * default.
4349  */
4350 static noinline int may_destroy_subvol(struct btrfs_root *root)
4351 {
4352         struct btrfs_fs_info *fs_info = root->fs_info;
4353         struct btrfs_path *path;
4354         struct btrfs_dir_item *di;
4355         struct btrfs_key key;
4356         u64 dir_id;
4357         int ret;
4358
4359         path = btrfs_alloc_path();
4360         if (!path)
4361                 return -ENOMEM;
4362
4363         /* Make sure this root isn't set as the default subvol */
4364         dir_id = btrfs_super_root_dir(fs_info->super_copy);
4365         di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4366                                    dir_id, "default", 7, 0);
4367         if (di && !IS_ERR(di)) {
4368                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4369                 if (key.objectid == root->root_key.objectid) {
4370                         ret = -EPERM;
4371                         btrfs_err(fs_info,
4372                                   "deleting default subvolume %llu is not allowed",
4373                                   key.objectid);
4374                         goto out;
4375                 }
4376                 btrfs_release_path(path);
4377         }
4378
4379         key.objectid = root->root_key.objectid;
4380         key.type = BTRFS_ROOT_REF_KEY;
4381         key.offset = (u64)-1;
4382
4383         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4384         if (ret < 0)
4385                 goto out;
4386         BUG_ON(ret == 0);
4387
4388         ret = 0;
4389         if (path->slots[0] > 0) {
4390                 path->slots[0]--;
4391                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4392                 if (key.objectid == root->root_key.objectid &&
4393                     key.type == BTRFS_ROOT_REF_KEY)
4394                         ret = -ENOTEMPTY;
4395         }
4396 out:
4397         btrfs_free_path(path);
4398         return ret;
4399 }
4400
/*
 * Delete all dentries for inodes belonging to the root.
 *
 * Walks root->inode_tree in increasing inode number order under
 * root->inode_lock.  For every inode that can be grabbed, the lock is dropped,
 * the dentry aliases are pruned if somebody else still holds a reference, and
 * our reference is put again.  Because the lock was dropped, the tree is
 * searched again from the next inode number instead of continuing from the
 * old node.
 */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *node;
        struct rb_node *prev;
        struct btrfs_inode *entry;
        struct inode *inode;
        /* Lowest inode number that still has to be visited. */
        u64 objectid = 0;

        /* Unless the filesystem is in an error state, the root should have no refs left. */
        if (!BTRFS_FS_ERROR(fs_info))
                WARN_ON(btrfs_root_refs(&root->root_item) != 0);

        spin_lock(&root->inode_lock);
again:
        /* Descend the tree looking for an inode numbered exactly objectid. */
        node = root->inode_tree.rb_node;
        prev = NULL;
        while (node) {
                prev = node;
                entry = rb_entry(node, struct btrfs_inode, rb_node);

                if (objectid < btrfs_ino(entry))
                        node = node->rb_left;
                else if (objectid > btrfs_ino(entry))
                        node = node->rb_right;
                else
                        break;
        }
        if (!node) {
                /*
                 * No exact match: starting at the last node visited, step
                 * forward to the first inode whose number is >= objectid.
                 */
                while (prev) {
                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
                        if (objectid <= btrfs_ino(entry)) {
                                node = prev;
                                break;
                        }
                        prev = rb_next(prev);
                }
        }
        while (node) {
                entry = rb_entry(node, struct btrfs_inode, rb_node);
                /* Remember where to resume in case the search is restarted. */
                objectid = btrfs_ino(entry) + 1;
                inode = igrab(&entry->vfs_inode);
                if (inode) {
                        spin_unlock(&root->inode_lock);
                        /* More than our own reference: prune the aliases. */
                        if (atomic_read(&inode->i_count) > 1)
                                d_prune_aliases(inode);
                        /*
                         * btrfs_drop_inode will have it removed from the inode
                         * cache when its usage count hits zero.
                         */
                        iput(inode);
                        cond_resched();
                        spin_lock(&root->inode_lock);
                        goto again;
                }

                /* A non-zero return restarts the search from objectid. */
                if (cond_resched_lock(&root->inode_lock))
                        goto again;

                node = rb_next(node);
        }
        spin_unlock(&root->inode_lock);
}
4464
4465 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
4466 {
4467         struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4468         struct btrfs_root *root = BTRFS_I(dir)->root;
4469         struct inode *inode = d_inode(dentry);
4470         struct btrfs_root *dest = BTRFS_I(inode)->root;
4471         struct btrfs_trans_handle *trans;
4472         struct btrfs_block_rsv block_rsv;
4473         u64 root_flags;
4474         int ret;
4475
4476         /*
4477          * Don't allow to delete a subvolume with send in progress. This is
4478          * inside the inode lock so the error handling that has to drop the bit
4479          * again is not run concurrently.
4480          */
4481         spin_lock(&dest->root_item_lock);
4482         if (dest->send_in_progress) {
4483                 spin_unlock(&dest->root_item_lock);
4484                 btrfs_warn(fs_info,
4485                            "attempt to delete subvolume %llu during send",
4486                            dest->root_key.objectid);
4487                 return -EPERM;
4488         }
4489         if (atomic_read(&dest->nr_swapfiles)) {
4490                 spin_unlock(&dest->root_item_lock);
4491                 btrfs_warn(fs_info,
4492                            "attempt to delete subvolume %llu with active swapfile",
4493                            root->root_key.objectid);
4494                 return -EPERM;
4495         }
4496         root_flags = btrfs_root_flags(&dest->root_item);
4497         btrfs_set_root_flags(&dest->root_item,
4498                              root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4499         spin_unlock(&dest->root_item_lock);
4500
4501         down_write(&fs_info->subvol_sem);
4502
4503         ret = may_destroy_subvol(dest);
4504         if (ret)
4505                 goto out_up_write;
4506
4507         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4508         /*
4509          * One for dir inode,
4510          * two for dir entries,
4511          * two for root ref/backref.
4512          */
4513         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4514         if (ret)
4515                 goto out_up_write;
4516
4517         trans = btrfs_start_transaction(root, 0);
4518         if (IS_ERR(trans)) {
4519                 ret = PTR_ERR(trans);
4520                 goto out_release;
4521         }
4522         trans->block_rsv = &block_rsv;
4523         trans->bytes_reserved = block_rsv.size;
4524
4525         btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
4526
4527         ret = btrfs_unlink_subvol(trans, dir, dentry);
4528         if (ret) {
4529                 btrfs_abort_transaction(trans, ret);
4530                 goto out_end_trans;
4531         }
4532
4533         ret = btrfs_record_root_in_trans(trans, dest);
4534         if (ret) {
4535                 btrfs_abort_transaction(trans, ret);
4536                 goto out_end_trans;
4537         }
4538
4539         memset(&dest->root_item.drop_progress, 0,
4540                 sizeof(dest->root_item.drop_progress));
4541         btrfs_set_root_drop_level(&dest->root_item, 0);
4542         btrfs_set_root_refs(&dest->root_item, 0);
4543
4544         if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4545                 ret = btrfs_insert_orphan_item(trans,
4546                                         fs_info->tree_root,
4547                                         dest->root_key.objectid);
4548                 if (ret) {
4549                         btrfs_abort_transaction(trans, ret);
4550                         goto out_end_trans;
4551                 }
4552         }
4553
4554         ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4555                                   BTRFS_UUID_KEY_SUBVOL,
4556                                   dest->root_key.objectid);
4557         if (ret && ret != -ENOENT) {
4558                 btrfs_abort_transaction(trans, ret);
4559                 goto out_end_trans;
4560         }
4561         if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4562                 ret = btrfs_uuid_tree_remove(trans,
4563                                           dest->root_item.received_uuid,
4564                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4565                                           dest->root_key.objectid);
4566                 if (ret && ret != -ENOENT) {
4567                         btrfs_abort_transaction(trans, ret);
4568                         goto out_end_trans;
4569                 }
4570         }
4571
4572         free_anon_bdev(dest->anon_dev);
4573         dest->anon_dev = 0;
4574 out_end_trans:
4575         trans->block_rsv = NULL;
4576         trans->bytes_reserved = 0;
4577         ret = btrfs_end_transaction(trans);
4578         inode->i_flags |= S_DEAD;
4579 out_release:
4580         btrfs_subvolume_release_metadata(root, &block_rsv);
4581 out_up_write:
4582         up_write(&fs_info->subvol_sem);
4583         if (ret) {
4584                 spin_lock(&dest->root_item_lock);
4585                 root_flags = btrfs_root_flags(&dest->root_item);
4586                 btrfs_set_root_flags(&dest->root_item,
4587                                 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4588                 spin_unlock(&dest->root_item_lock);
4589         } else {
4590                 d_invalidate(dentry);
4591                 btrfs_prune_dentries(dest);
4592                 ASSERT(dest->send_in_progress == 0);
4593         }
4594
4595         return ret;
4596 }
4597
4598 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4599 {
4600         struct inode *inode = d_inode(dentry);
4601         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4602         int err = 0;
4603         struct btrfs_trans_handle *trans;
4604         u64 last_unlink_trans;
4605
4606         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4607                 return -ENOTEMPTY;
4608         if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4609                 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4610                         btrfs_err(fs_info,
4611                         "extent tree v2 doesn't support snapshot deletion yet");
4612                         return -EOPNOTSUPP;
4613                 }
4614                 return btrfs_delete_subvolume(dir, dentry);
4615         }
4616
4617         trans = __unlink_start_trans(dir);
4618         if (IS_ERR(trans))
4619                 return PTR_ERR(trans);
4620
4621         if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4622                 err = btrfs_unlink_subvol(trans, dir, dentry);
4623                 goto out;
4624         }
4625
4626         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4627         if (err)
4628                 goto out;
4629
4630         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4631
4632         /* now the directory is empty */
4633         err = btrfs_unlink_inode(trans, BTRFS_I(dir),
4634                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4635                         dentry->d_name.len);
4636         if (!err) {
4637                 btrfs_i_size_write(BTRFS_I(inode), 0);
4638                 /*
4639                  * Propagate the last_unlink_trans value of the deleted dir to
4640                  * its parent directory. This is to prevent an unrecoverable
4641                  * log tree in the case we do something like this:
4642                  * 1) create dir foo
4643                  * 2) create snapshot under dir foo
4644                  * 3) delete the snapshot
4645                  * 4) rmdir foo
4646                  * 5) mkdir foo
4647                  * 6) fsync foo or some file inside foo
4648                  */
4649                 if (last_unlink_trans >= trans->transid)
4650                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4651         }
4652 out:
4653         btrfs_end_transaction(trans);
4654         btrfs_btree_balance_dirty(fs_info);
4655
4656         return err;
4657 }
4658
4659 /*
4660  * btrfs_truncate_block - read, zero a chunk and write a block
4661  * @inode - inode that we're zeroing
4662  * @from - the offset to start zeroing
4663  * @len - the length to zero, 0 to zero the entire range respective to the
4664  *      offset
4665  * @front - zero up to the offset instead of from the offset on
4666  *
4667  * This will find the block for the "from" offset and cow the block and zero the
4668  * part we want to zero.  This is used with truncate and hole punching.
4669  */
4670 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4671                          int front)
4672 {
4673         struct btrfs_fs_info *fs_info = inode->root->fs_info;
4674         struct address_space *mapping = inode->vfs_inode.i_mapping;
4675         struct extent_io_tree *io_tree = &inode->io_tree;
4676         struct btrfs_ordered_extent *ordered;
4677         struct extent_state *cached_state = NULL;
4678         struct extent_changeset *data_reserved = NULL;
4679         bool only_release_metadata = false;
4680         u32 blocksize = fs_info->sectorsize;
4681         pgoff_t index = from >> PAGE_SHIFT;
4682         unsigned offset = from & (blocksize - 1);
4683         struct page *page;
4684         gfp_t mask = btrfs_alloc_write_mask(mapping);
4685         size_t write_bytes = blocksize;
4686         int ret = 0;
4687         u64 block_start;
4688         u64 block_end;
4689
4690         if (IS_ALIGNED(offset, blocksize) &&
4691             (!len || IS_ALIGNED(len, blocksize)))
4692                 goto out;
4693
4694         block_start = round_down(from, blocksize);
4695         block_end = block_start + blocksize - 1;
4696
4697         ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4698                                           blocksize);
4699         if (ret < 0) {
4700                 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
4701                         /* For nocow case, no need to reserve data space */
4702                         only_release_metadata = true;
4703                 } else {
4704                         goto out;
4705                 }
4706         }
4707         ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
4708         if (ret < 0) {
4709                 if (!only_release_metadata)
4710                         btrfs_free_reserved_data_space(inode, data_reserved,
4711                                                        block_start, blocksize);
4712                 goto out;
4713         }
4714 again:
4715         page = find_or_create_page(mapping, index, mask);
4716         if (!page) {
4717                 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4718                                              blocksize, true);
4719                 btrfs_delalloc_release_extents(inode, blocksize);
4720                 ret = -ENOMEM;
4721                 goto out;
4722         }
4723         ret = set_page_extent_mapped(page);
4724         if (ret < 0)
4725                 goto out_unlock;
4726
4727         if (!PageUptodate(page)) {
4728                 ret = btrfs_readpage(NULL, page);
4729                 lock_page(page);
4730                 if (page->mapping != mapping) {
4731                         unlock_page(page);
4732                         put_page(page);
4733                         goto again;
4734                 }
4735                 if (!PageUptodate(page)) {
4736                         ret = -EIO;
4737                         goto out_unlock;
4738                 }
4739         }
4740         wait_on_page_writeback(page);
4741
4742         lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4743
4744         ordered = btrfs_lookup_ordered_extent(inode, block_start);
4745         if (ordered) {
4746                 unlock_extent_cached(io_tree, block_start, block_end,
4747                                      &cached_state);
4748                 unlock_page(page);
4749                 put_page(page);
4750                 btrfs_start_ordered_extent(ordered, 1);
4751                 btrfs_put_ordered_extent(ordered);
4752                 goto again;
4753         }
4754
4755         clear_extent_bit(&inode->io_tree, block_start, block_end,
4756                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4757                          0, 0, &cached_state);
4758
4759         ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4760                                         &cached_state);
4761         if (ret) {
4762                 unlock_extent_cached(io_tree, block_start, block_end,
4763                                      &cached_state);
4764                 goto out_unlock;
4765         }
4766
4767         if (offset != blocksize) {
4768                 if (!len)
4769                         len = blocksize - offset;
4770                 if (front)
4771                         memzero_page(page, (block_start - page_offset(page)),
4772                                      offset);
4773                 else
4774                         memzero_page(page, (block_start - page_offset(page)) + offset,
4775                                      len);
4776                 flush_dcache_page(page);
4777         }
4778         btrfs_page_clear_checked(fs_info, page, block_start,
4779                                  block_end + 1 - block_start);
4780         btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4781         unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
4782
4783         if (only_release_metadata)
4784                 set_extent_bit(&inode->io_tree, block_start, block_end,
4785                                EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
4786
4787 out_unlock:
4788         if (ret) {
4789                 if (only_release_metadata)
4790                         btrfs_delalloc_release_metadata(inode, blocksize, true);
4791                 else
4792                         btrfs_delalloc_release_space(inode, data_reserved,
4793                                         block_start, blocksize, true);
4794         }
4795         btrfs_delalloc_release_extents(inode, blocksize);
4796         unlock_page(page);
4797         put_page(page);
4798 out:
4799         if (only_release_metadata)
4800                 btrfs_check_nocow_unlock(inode);
4801         extent_changeset_free(data_reserved);
4802         return ret;
4803 }
4804
4805 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4806                              u64 offset, u64 len)
4807 {
4808         struct btrfs_fs_info *fs_info = root->fs_info;
4809         struct btrfs_trans_handle *trans;
4810         struct btrfs_drop_extents_args drop_args = { 0 };
4811         int ret;
4812
4813         /*
4814          * If NO_HOLES is enabled, we don't need to do anything.
4815          * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4816          * or btrfs_update_inode() will be called, which guarantee that the next
4817          * fsync will know this inode was changed and needs to be logged.
4818          */
4819         if (btrfs_fs_incompat(fs_info, NO_HOLES))
4820                 return 0;
4821
4822         /*
4823          * 1 - for the one we're dropping
4824          * 1 - for the one we're adding
4825          * 1 - for updating the inode.
4826          */
4827         trans = btrfs_start_transaction(root, 3);
4828         if (IS_ERR(trans))
4829                 return PTR_ERR(trans);
4830
4831         drop_args.start = offset;
4832         drop_args.end = offset + len;
4833         drop_args.drop_cache = true;
4834
4835         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4836         if (ret) {
4837                 btrfs_abort_transaction(trans, ret);
4838                 btrfs_end_transaction(trans);
4839                 return ret;
4840         }
4841
4842         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
4843                         offset, 0, 0, len, 0, len, 0, 0, 0);
4844         if (ret) {
4845                 btrfs_abort_transaction(trans, ret);
4846         } else {
4847                 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4848                 btrfs_update_inode(trans, root, inode);
4849         }
4850         btrfs_end_transaction(trans);
4851         return ret;
4852 }
4853
/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 *
 * @inode:   inode being expanded
 * @oldsize: size of the file before the expansion
 * @size:    size of the file after the expansion
 *
 * The range worked on is sector aligned: it starts at @oldsize rounded up and
 * ends at @size rounded up.  Returns 0 on success or a negative errno.
 */
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
        u64 block_end = ALIGN(size, fs_info->sectorsize);
        u64 last_byte;
        u64 cur_offset;
        u64 hole_size;
        int err = 0;

        /*
         * If our size started in the middle of a block we need to zero out the
         * rest of the block before we expand the i_size, otherwise we could
         * expose stale data.
         */
        err = btrfs_truncate_block(inode, oldsize, 0, 0);
        if (err)
                return err;

        /* The new size does not reach past the aligned old end: no hole needed. */
        if (size <= hole_start)
                return 0;

        btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
                                           &cached_state);
        cur_offset = hole_start;
        /* Walk the extent maps covering [hole_start, block_end). */
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      block_end - cur_offset);
                if (IS_ERR(em)) {
                        err = PTR_ERR(em);
                        /* Keep the free_extent_map() after the loop away from the error pointer. */
                        em = NULL;
                        break;
                }
                /* Handle at most up to the end of this extent map or block_end. */
                last_byte = min(extent_map_end(em), block_end);
                last_byte = ALIGN(last_byte, fs_info->sectorsize);
                hole_size = last_byte - cur_offset;

                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                        struct extent_map *hole_em;

                        err = maybe_insert_hole(root, inode, cur_offset,
                                                hole_size);
                        if (err)
                                break;

                        err = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (err)
                                break;

                        btrfs_drop_extent_cache(inode, cur_offset,
                                                cur_offset + hole_size - 1, 0);
                        hole_em = alloc_extent_map();
                        if (!hole_em) {
                                /*
                                 * No memory for the cached hole mapping: flag
                                 * the inode for a full sync and move on.
                                 */
                                btrfs_set_inode_full_sync(inode);
                                goto next;
                        }
                        hole_em->start = cur_offset;
                        hole_em->len = hole_size;
                        hole_em->orig_start = cur_offset;

                        hole_em->block_start = EXTENT_MAP_HOLE;
                        hole_em->block_len = 0;
                        hole_em->orig_block_len = 0;
                        hole_em->ram_bytes = hole_size;
                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
                        hole_em->generation = fs_info->generation;

                        /*
                         * Retry the insertion for as long as it collides with
                         * an existing mapping (-EEXIST), dropping the cached
                         * range before each new attempt.
                         * NOTE(review): any other error from
                         * add_extent_mapping() is left in err and is not
                         * checked before the outer loop continues - confirm
                         * this is intended.
                         */
                        while (1) {
                                write_lock(&em_tree->lock);
                                err = add_extent_mapping(em_tree, hole_em, 1);
                                write_unlock(&em_tree->lock);
                                if (err != -EEXIST)
                                        break;
                                btrfs_drop_extent_cache(inode, cur_offset,
                                                        cur_offset +
                                                        hole_size - 1, 0);
                        }
                        free_extent_map(hole_em);
                } else {
                        /* Preallocated extent: no hole is inserted, only the range is recorded. */
                        err = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (err)
                                break;
                }
next:
                free_extent_map(em);
                em = NULL;
                cur_offset = last_byte;
                if (cur_offset >= block_end)
                        break;
        }
        /* em is still set only when the loop was left through an error break. */
        free_extent_map(em);
        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
        return err;
}
4961
/*
 * Change the size of @inode to attr->ia_size.
 *
 * If the new size is larger than the current i_size, btrfs_cont_expand() is
 * called for the range between the old and the new size, and the new i_size
 * is then written and the inode updated inside a transaction.  The whole
 * sequence runs with root->snapshot_lock held for write.
 *
 * Otherwise (new size smaller than or equal to the old one) the page cache is
 * cut with truncate_setsize() and btrfs_truncate() is called.
 *
 * Returns the result of the last helper that ran; any non-zero value returned
 * by a helper along the way is passed back to the caller.
 */
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        loff_t oldsize = i_size_read(inode);
        loff_t newsize = attr->ia_size;
        int mask = attr->ia_valid;
        int ret;

        /*
         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
         * special case where we need to update the times despite not having
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
        if (newsize != oldsize) {
                inode_inc_iversion(inode);
                if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
                        inode->i_ctime = inode->i_mtime =
                                current_time(inode);
        }

        if (newsize > oldsize) {
                /*
                 * Don't do an expanding truncate while snapshotting is ongoing.
                 * This is to ensure the snapshot captures a fully consistent
                 * state of this file - if the snapshot captures this expanding
                 * truncation, it must capture all writes that happened before
                 * this truncation.
                 */
                btrfs_drew_write_lock(&root->snapshot_lock);
                ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
                if (ret) {
                        btrfs_drew_write_unlock(&root->snapshot_lock);
                        return ret;
                }

                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_drew_write_unlock(&root->snapshot_lock);
                        return PTR_ERR(trans);
                }

                /* Publish the new size in memory, then persist the inode. */
                i_size_write(inode, newsize);
                btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
                pagecache_isize_extended(inode, oldsize, newsize);
                ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
                btrfs_drew_write_unlock(&root->snapshot_lock);
                btrfs_end_transaction(trans);
        } else {
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

                /*
                 * On a zoned filesystem, wait for ordered extents from the
                 * sector-aligned new size to the end of the file first.
                 */
                if (btrfs_is_zoned(fs_info)) {
                        ret = btrfs_wait_ordered_range(inode,
                                        ALIGN(newsize, fs_info->sectorsize),
                                        (u64)-1);
                        if (ret)
                                return ret;
                }

                /*
                 * We're truncating a file that used to have good data down to
                 * zero. Make sure any new writes to the file get on disk
                 * on close.
                 */
                if (newsize == 0)
                        set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);

                truncate_setsize(inode, newsize);

                inode_dio_wait(inode);

                /* The second argument is true when the size did not change. */
                ret = btrfs_truncate(inode, newsize == oldsize);
                if (ret && inode->i_nlink) {
                        int err;

                        /*
                         * Truncate failed, so fix up the in-memory size. We
                         * adjusted disk_i_size down as we removed extents, so
                         * wait for disk_i_size to be stable and then update the
                         * in-memory size to match.
                         */
                        err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
                        if (err)
                                return err;
                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
                }
        }

        return ret;
}
5054
5055 static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
5056                          struct iattr *attr)
5057 {
5058         struct inode *inode = d_inode(dentry);
5059         struct btrfs_root *root = BTRFS_I(inode)->root;
5060         int err;
5061
5062         if (btrfs_root_readonly(root))
5063                 return -EROFS;
5064
5065         err = setattr_prepare(mnt_userns, dentry, attr);
5066         if (err)
5067                 return err;
5068
5069         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5070                 err = btrfs_setsize(inode, attr);
5071                 if (err)
5072                         return err;
5073         }
5074
5075         if (attr->ia_valid) {
5076                 setattr_copy(mnt_userns, inode, attr);
5077                 inode_inc_iversion(inode);
5078                 err = btrfs_dirty_inode(inode);
5079
5080                 if (!err && attr->ia_valid & ATTR_MODE)
5081                         err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
5082         }
5083
5084         return err;
5085 }
5086
/*
 * While truncating the inode pages during eviction, we get the VFS
 * calling btrfs_invalidate_folio() against each folio of the inode. This
 * is slow because the calls to btrfs_invalidate_folio() result in a
 * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
 * which keep merging and splitting extent_state structures over and over,
 * wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 * skip all those expensive operations on a per folio basis and do only
 * the ordered io finishing, while we release here the extent_map and
 * extent_state structures, without the excessive merging and splitting.
 *
 * @inode must have I_FREEING set in i_state (asserted below).
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
        struct rb_node *node;

        ASSERT(inode->i_state & I_FREEING);
        truncate_inode_pages_final(&inode->i_data);

        /*
         * Drop every extent map, always taking the first one in the tree.
         * The tree lock is released around cond_resched() when a reschedule
         * is needed.
         */
        write_lock(&map_tree->lock);
        while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
                struct extent_map *em;

                node = rb_first_cached(&map_tree->map);
                em = rb_entry(node, struct extent_map, rb_node);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
                remove_extent_mapping(map_tree, em);
                free_extent_map(em);
                if (need_resched()) {
                        write_unlock(&map_tree->lock);
                        cond_resched();
                        write_lock(&map_tree->lock);
                }
        }
        write_unlock(&map_tree->lock);

        /*
         * Keep looping until we have no more ranges in the io tree.
         * We can have ongoing bios started by readahead that have
         * their endio callback (extent_io.c:end_bio_extent_readpage)
         * still in progress (they unlocked the pages in the bio but have not
         * yet unlocked the ranges in the io tree). Therefore this means some
         * ranges can still be locked and eviction started because before
         * submitting those bios, which are executed by a separate task (work
         * queue kthread), inode references (inode->i_count) were not taken
         * (which would be dropped in the end io callback of each bio).
         * Therefore here we effectively end up waiting for those bios and
         * anyone else holding locked ranges without having bumped the inode's
         * reference count - if we don't do it, when they access the inode's
         * io_tree to unlock a range it may be too late, leading to an
         * use-after-free issue.
         */
        spin_lock(&io_tree->lock);
        while (!RB_EMPTY_ROOT(&io_tree->state)) {
                struct extent_state *state;
                struct extent_state *cached_state = NULL;
                u64 start;
                u64 end;
                unsigned state_flags;

                /*
                 * Snapshot the first state's range and flags while holding
                 * the tree lock; the lock is dropped before the range is
                 * locked and cleared below.
                 */
                node = rb_first(&io_tree->state);
                state = rb_entry(node, struct extent_state, rb_node);
                start = state->start;
                end = state->end;
                state_flags = state->state;
                spin_unlock(&io_tree->lock);

                lock_extent_bits(io_tree, start, end, &cached_state);

                /*
                 * If still has DELALLOC flag, the extent didn't reach disk,
                 * and its reserved space won't be freed by delayed_ref.
                 * So we need to free its reserved space here.
                 * (Refer to comment in btrfs_invalidate_folio, case 2)
                 *
                 * Note, end is the bytenr of last byte, so we need + 1 here.
                 */
                if (state_flags & EXTENT_DELALLOC)
                        btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
                                               end - start + 1);

                clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
                                 &cached_state);

                cond_resched();
                spin_lock(&io_tree->lock);
        }
        spin_unlock(&io_tree->lock);
}
5182
/*
 * Refill @rsv for an eviction step and obtain a transaction handle for @root
 * through btrfs_join_transaction().
 *
 * The refill is first attempted for rsv->size plus the metadata size of one
 * insertion (delayed_refs_extra).  If that fails it is retried for rsv->size
 * alone.  When the extra amount was reserved, it is migrated from @rsv to
 * fs_info->trans_block_rsv and recorded in trans->bytes_reserved.
 *
 * Returns the transaction handle, ERR_PTR(-ENOSPC) if neither refill
 * succeeded, or the error pointer returned by btrfs_join_transaction().
 */
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
                                                        struct btrfs_block_rsv *rsv)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
        int ret;

        /*
         * Eviction should be taking place at some place safe because of our
         * delayed iputs.  However the normal flushing code will run delayed
         * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
         *
         * We reserve the delayed_refs_extra here again because we can't use
         * btrfs_start_transaction(root, 0) for the same deadlocky reason as
         * above.  We reserve our extra bit here because we generate a ton of
         * delayed refs activity by truncating.
         *
         * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
         * if we fail to make this reservation we can re-try without the
         * delayed_refs_extra so we can make some forward progress.
         */
        ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
                                     BTRFS_RESERVE_FLUSH_EVICT);
        if (ret) {
                ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
                                             BTRFS_RESERVE_FLUSH_EVICT);
                if (ret) {
                        btrfs_warn(fs_info,
                                   "could not allocate space for delete; will truncate on mount");
                        return ERR_PTR(-ENOSPC);
                }
                /* Only the base amount was reserved; nothing to migrate. */
                delayed_refs_extra = 0;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return trans;

        if (delayed_refs_extra) {
                trans->block_rsv = &fs_info->trans_block_rsv;
                trans->bytes_reserved = delayed_refs_extra;
                btrfs_block_rsv_migrate(rsv, trans->block_rsv,
                                        delayed_refs_extra, 1);
        }
        return trans;
}
5230
/*
 * Evict @inode.
 *
 * The cached pages, extent maps and io tree state are always released (unless
 * the inode has no root, in which case only the generic cleanup runs).  The
 * on-disk items are truncated down to size 0 and the orphan item is deleted
 * only when the inode has no links left and none of the early-exit checks
 * below fire.  Every path ends with fsverity_cleanup_inode() and
 * clear_inode().
 */
void btrfs_evict_inode(struct inode *inode)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv;
        int ret;

        trace_btrfs_inode_evict(inode);

        /* No root attached: nothing btrfs specific to tear down. */
        if (!root) {
                fsverity_cleanup_inode(inode);
                clear_inode(inode);
                return;
        }

        evict_inode_truncate_pages(inode);

        /*
         * Still linked and either living in a referenced root other than the
         * root tree, or a free space inode: keep the on-disk items.
         */
        if (inode->i_nlink &&
            ((btrfs_root_refs(&root->root_item) != 0 &&
              root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
             btrfs_is_free_space_inode(BTRFS_I(inode))))
                goto no_delete;

        if (is_bad_inode(inode))
                goto no_delete;

        btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);

        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                goto no_delete;

        if (inode->i_nlink > 0) {
                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
                       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
                goto no_delete;
        }

        /*
         * This makes sure the inode item in tree is uptodate and the space for
         * the inode update is released.
         */
        ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
        if (ret)
                goto no_delete;

        /*
         * This drops any pending insert or delete operations we have for this
         * inode.  We could have a delayed dir index deletion queued up, but
         * we're removing the inode completely so that'll be taken care of in
         * the truncate.
         */
        btrfs_kill_delayed_inode_items(BTRFS_I(inode));

        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                goto no_delete;
        rsv->size = btrfs_calc_metadata_size(fs_info, 1);
        rsv->failfast = 1;

        btrfs_i_size_write(BTRFS_I(inode), 0);

        /*
         * Truncate the inode's items, one transaction handle per iteration.
         * -ENOSPC and -EAGAIN from the truncate cause another iteration with
         * a refilled reservation; any other error abandons the deletion.
         */
        while (1) {
                struct btrfs_truncate_control control = {
                        .inode = BTRFS_I(inode),
                        .ino = btrfs_ino(BTRFS_I(inode)),
                        .new_size = 0,
                        .min_type = 0,
                };

                trans = evict_refill_and_join(root, rsv);
                if (IS_ERR(trans))
                        goto free_rsv;

                /* Use our temporary reservation only for the truncate. */
                trans->block_rsv = rsv;

                ret = btrfs_truncate_inode_items(trans, root, &control);
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);
                if (ret && ret != -ENOSPC && ret != -EAGAIN)
                        goto free_rsv;
                else if (!ret)
                        break;
        }

        /*
         * Errors here aren't a big deal, it just means we leave orphan items in
         * the tree. They will be cleaned up on the next mount. If the inode
         * number gets reused, cleanup deletes the orphan item without doing
         * anything, and unlink reuses the existing orphan item.
         *
         * If it turns out that we are dropping too many of these, we might want
         * to add a mechanism for retrying these after a commit.
         */
        trans = evict_refill_and_join(root, rsv);
        if (!IS_ERR(trans)) {
                trans->block_rsv = rsv;
                btrfs_orphan_del(trans, BTRFS_I(inode));
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
        }

free_rsv:
        btrfs_free_block_rsv(fs_info, rsv);
no_delete:
        /*
         * If we didn't successfully delete, the orphan item will still be in
         * the tree and we'll retry on the next mount. Again, we might also want
         * to retry these periodically in the future.
         */
        btrfs_remove_delayed_node(BTRFS_I(inode));
        fsverity_cleanup_inode(inode);
        clear_inode(inode);
}
5346
5347 /*
5348  * Return the key found in the dir entry in the location pointer, fill @type
5349  * with BTRFS_FT_*, and return 0.
5350  *
5351  * If no dir entries were found, returns -ENOENT.
5352  * If found a corrupted location in dir entry, returns -EUCLEAN.
5353  */
5354 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5355                                struct btrfs_key *location, u8 *type)
5356 {
5357         const char *name = dentry->d_name.name;
5358         int namelen = dentry->d_name.len;
5359         struct btrfs_dir_item *di;
5360         struct btrfs_path *path;
5361         struct btrfs_root *root = BTRFS_I(dir)->root;
5362         int ret = 0;
5363
5364         path = btrfs_alloc_path();
5365         if (!path)
5366                 return -ENOMEM;
5367
5368         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5369                         name, namelen, 0);
5370         if (IS_ERR_OR_NULL(di)) {
5371                 ret = di ? PTR_ERR(di) : -ENOENT;
5372                 goto out;
5373         }
5374
5375         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5376         if (location->type != BTRFS_INODE_ITEM_KEY &&
5377             location->type != BTRFS_ROOT_ITEM_KEY) {
5378                 ret = -EUCLEAN;
5379                 btrfs_warn(root->fs_info,
5380 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5381                            __func__, name, btrfs_ino(BTRFS_I(dir)),
5382                            location->objectid, location->type, location->offset);
5383         }
5384         if (!ret)
5385                 *type = btrfs_dir_type(path->nodes[0], di);
5386 out:
5387         btrfs_free_path(path);
5388         return ret;
5389 }
5390
5391 /*
5392  * when we hit a tree root in a directory, the btrfs part of the inode
5393  * needs to be changed to reflect the root directory of the tree root.  This
5394  * is kind of like crossing a mount point.
5395  */
5396 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5397                                     struct inode *dir,
5398                                     struct dentry *dentry,
5399                                     struct btrfs_key *location,
5400                                     struct btrfs_root **sub_root)
5401 {
5402         struct btrfs_path *path;
5403         struct btrfs_root *new_root;
5404         struct btrfs_root_ref *ref;
5405         struct extent_buffer *leaf;
5406         struct btrfs_key key;
5407         int ret;
5408         int err = 0;
5409
5410         path = btrfs_alloc_path();
5411         if (!path) {
5412                 err = -ENOMEM;
5413                 goto out;
5414         }
5415
5416         err = -ENOENT;
5417         key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5418         key.type = BTRFS_ROOT_REF_KEY;
5419         key.offset = location->objectid;
5420
5421         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5422         if (ret) {
5423                 if (ret < 0)
5424                         err = ret;
5425                 goto out;
5426         }
5427
5428         leaf = path->nodes[0];
5429         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5430         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5431             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5432                 goto out;
5433
5434         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5435                                    (unsigned long)(ref + 1),
5436                                    dentry->d_name.len);
5437         if (ret)
5438                 goto out;
5439
5440         btrfs_release_path(path);
5441
5442         new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5443         if (IS_ERR(new_root)) {
5444                 err = PTR_ERR(new_root);
5445                 goto out;
5446         }
5447
5448         *sub_root = new_root;
5449         location->objectid = btrfs_root_dirid(&new_root->root_item);
5450         location->type = BTRFS_INODE_ITEM_KEY;
5451         location->offset = 0;
5452         err = 0;
5453 out:
5454         btrfs_free_path(path);
5455         return err;
5456 }
5457
5458 static void inode_tree_add(struct inode *inode)
5459 {
5460         struct btrfs_root *root = BTRFS_I(inode)->root;
5461         struct btrfs_inode *entry;
5462         struct rb_node **p;
5463         struct rb_node *parent;
5464         struct rb_node *new = &BTRFS_I(inode)->rb_node;
5465         u64 ino = btrfs_ino(BTRFS_I(inode));
5466
5467         if (inode_unhashed(inode))
5468                 return;
5469         parent = NULL;
5470         spin_lock(&root->inode_lock);
5471         p = &root->inode_tree.rb_node;
5472         while (*p) {
5473                 parent = *p;
5474                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5475
5476                 if (ino < btrfs_ino(entry))
5477                         p = &parent->rb_left;
5478                 else if (ino > btrfs_ino(entry))
5479                         p = &parent->rb_right;
5480                 else {
5481                         WARN_ON(!(entry->vfs_inode.i_state &
5482                                   (I_WILL_FREE | I_FREEING)));
5483                         rb_replace_node(parent, new, &root->inode_tree);
5484                         RB_CLEAR_NODE(parent);
5485                         spin_unlock(&root->inode_lock);
5486                         return;
5487                 }
5488         }
5489         rb_link_node(new, parent, p);
5490         rb_insert_color(new, &root->inode_tree);
5491         spin_unlock(&root->inode_lock);
5492 }
5493
5494 static void inode_tree_del(struct btrfs_inode *inode)
5495 {
5496         struct btrfs_root *root = inode->root;
5497         int empty = 0;
5498
5499         spin_lock(&root->inode_lock);
5500         if (!RB_EMPTY_NODE(&inode->rb_node)) {
5501                 rb_erase(&inode->rb_node, &root->inode_tree);
5502                 RB_CLEAR_NODE(&inode->rb_node);
5503                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5504         }
5505         spin_unlock(&root->inode_lock);
5506
5507         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5508                 spin_lock(&root->inode_lock);
5509                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5510                 spin_unlock(&root->inode_lock);
5511                 if (empty)
5512                         btrfs_add_dead_root(root);
5513         }
5514 }
5515
5516
5517 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5518 {
5519         struct btrfs_iget_args *args = p;
5520
5521         inode->i_ino = args->ino;
5522         BTRFS_I(inode)->location.objectid = args->ino;
5523         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5524         BTRFS_I(inode)->location.offset = 0;
5525         BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5526         BUG_ON(args->root && !BTRFS_I(inode)->root);
5527         return 0;
5528 }
5529
5530 static int btrfs_find_actor(struct inode *inode, void *opaque)
5531 {
5532         struct btrfs_iget_args *args = opaque;
5533
5534         return args->ino == BTRFS_I(inode)->location.objectid &&
5535                 args->root == BTRFS_I(inode)->root;
5536 }
5537
5538 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5539                                        struct btrfs_root *root)
5540 {
5541         struct inode *inode;
5542         struct btrfs_iget_args args;
5543         unsigned long hashval = btrfs_inode_hash(ino, root);
5544
5545         args.ino = ino;
5546         args.root = root;
5547
5548         inode = iget5_locked(s, hashval, btrfs_find_actor,
5549                              btrfs_init_locked_inode,
5550                              (void *)&args);
5551         return inode;
5552 }
5553
5554 /*
5555  * Get an inode object given its inode number and corresponding root.
5556  * Path can be preallocated to prevent recursing back to iget through
5557  * allocator. NULL is also valid but may require an additional allocation
5558  * later.
5559  */
5560 struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5561                               struct btrfs_root *root, struct btrfs_path *path)
5562 {
5563         struct inode *inode;
5564
5565         inode = btrfs_iget_locked(s, ino, root);
5566         if (!inode)
5567                 return ERR_PTR(-ENOMEM);
5568
5569         if (inode->i_state & I_NEW) {
5570                 int ret;
5571
5572                 ret = btrfs_read_locked_inode(inode, path);
5573                 if (!ret) {
5574                         inode_tree_add(inode);
5575                         unlock_new_inode(inode);
5576                 } else {
5577                         iget_failed(inode);
5578                         /*
5579                          * ret > 0 can come from btrfs_search_slot called by
5580                          * btrfs_read_locked_inode, this means the inode item
5581                          * was not found.
5582                          */
5583                         if (ret > 0)
5584                                 ret = -ENOENT;
5585                         inode = ERR_PTR(ret);
5586                 }
5587         }
5588
5589         return inode;
5590 }
5591
5592 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5593 {
5594         return btrfs_iget_path(s, ino, root, NULL);
5595 }
5596
5597 static struct inode *new_simple_dir(struct super_block *s,
5598                                     struct btrfs_key *key,
5599                                     struct btrfs_root *root)
5600 {
5601         struct inode *inode = new_inode(s);
5602
5603         if (!inode)
5604                 return ERR_PTR(-ENOMEM);
5605
5606         BTRFS_I(inode)->root = btrfs_grab_root(root);
5607         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5608         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5609
5610         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5611         /*
5612          * We only need lookup, the rest is read-only and there's no inode
5613          * associated with the dentry
5614          */
5615         inode->i_op = &simple_dir_inode_operations;
5616         inode->i_opflags &= ~IOP_XATTR;
5617         inode->i_fop = &simple_dir_operations;
5618         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5619         inode->i_mtime = current_time(inode);
5620         inode->i_atime = inode->i_mtime;
5621         inode->i_ctime = inode->i_mtime;
5622         BTRFS_I(inode)->i_otime = inode->i_mtime;
5623
5624         return inode;
5625 }
5626
/*
 * Compile-time checks that each BTRFS_FT_* value equals the corresponding
 * generic FT_* value.  btrfs_inode_type() below returns the result of
 * fs_umode_to_ftype(), and btrfs_lookup_dentry() compares that result
 * directly against the type read from a btrfs directory entry.
 */
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5635
5636 static inline u8 btrfs_inode_type(struct inode *inode)
5637 {
5638         return fs_umode_to_ftype(inode->i_mode);
5639 }
5640
/*
 * Look up the name of @dentry in directory @dir and return its inode.
 *
 * If the directory entry points at an inode item, the inode is fetched from
 * the directory's own root and its mode is cross-checked against the type
 * stored in the directory entry.  Otherwise the entry is resolved through
 * fixup_tree_root_location(); when that reports -ENOENT a dummy directory
 * inode from new_simple_dir() is returned instead.
 *
 * Returns the inode or an ERR_PTR(): -ENAMETOOLONG for names longer than
 * BTRFS_NAME_LEN, -EUCLEAN on an inode mode / dir type mismatch, or an error
 * from one of the helpers.
 */
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *sub_root = root;
        struct btrfs_key location;
        u8 di_type = 0;
        int ret = 0;

        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
        if (ret < 0)
                return ERR_PTR(ret);

        if (location.type == BTRFS_INODE_ITEM_KEY) {
                inode = btrfs_iget(dir->i_sb, location.objectid, root);
                if (IS_ERR(inode))
                        return inode;

                /* Do extra check against inode mode with di_type */
                if (btrfs_inode_type(inode) != di_type) {
                        btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
                                  inode->i_mode, btrfs_inode_type(inode),
                                  di_type);
                        iput(inode);
                        return ERR_PTR(-EUCLEAN);
                }
                return inode;
        }

        /*
         * The entry does not point at an inode item.  On success sub_root is
         * replaced with the root fetched by fixup_tree_root_location().
         */
        ret = fixup_tree_root_location(fs_info, dir, dentry,
                                       &location, &sub_root);
        if (ret < 0) {
                if (ret != -ENOENT)
                        inode = ERR_PTR(ret);
                else
                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
        } else {
                inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
        }
        /*
         * Drop the reference on the new root.
         * NOTE(review): sub_root is still passed to btrfs_orphan_cleanup()
         * below after this put; presumably the inode returned by btrfs_iget()
         * holds its own reference on the root - confirm.
         */
        if (root != sub_root)
                btrfs_put_root(sub_root);

        /* For a different root, run orphan cleanup unless mounted read-only. */
        if (!IS_ERR(inode) && root != sub_root) {
                down_read(&fs_info->cleanup_work_sem);
                if (!sb_rdonly(inode->i_sb))
                        ret = btrfs_orphan_cleanup(sub_root);
                up_read(&fs_info->cleanup_work_sem);
                if (ret) {
                        iput(inode);
                        inode = ERR_PTR(ret);
                }
        }

        return inode;
}
5701
5702 static int btrfs_dentry_delete(const struct dentry *dentry)
5703 {
5704         struct btrfs_root *root;
5705         struct inode *inode = d_inode(dentry);
5706
5707         if (!inode && !IS_ROOT(dentry))
5708                 inode = d_inode(dentry->d_parent);
5709
5710         if (inode) {
5711                 root = BTRFS_I(inode)->root;
5712                 if (btrfs_root_refs(&root->root_item) == 0)
5713                         return 1;
5714
5715                 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5716                         return 1;
5717         }
5718         return 0;
5719 }
5720
5721 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5722                                    unsigned int flags)
5723 {
5724         struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5725
5726         if (inode == ERR_PTR(-ENOENT))
5727                 inode = NULL;
5728         return d_splice_alias(inode, dentry);
5729 }
5730
5731 /*
5732  * All this infrastructure exists because dir_emit can fault, and we are holding
5733  * the tree lock when doing readdir.  For now just allocate a buffer and copy
5734  * our information into that, and then dir_emit from the buffer.  This is
5735  * similar to what NFS does, only we don't keep the buffer around in pagecache
5736  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5737  * copy_to_user_inatomic so we don't have to worry about page faulting under the
5738  * tree lock.
5739  */
5740 static int btrfs_opendir(struct inode *inode, struct file *file)
5741 {
5742         struct btrfs_file_private *private;
5743
5744         private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5745         if (!private)
5746                 return -ENOMEM;
5747         private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5748         if (!private->filldir_buf) {
5749                 kfree(private);
5750                 return -ENOMEM;
5751         }
5752         file->private_data = private;
5753         return 0;
5754 }
5755
5756 struct dir_entry {
5757         u64 ino;
5758         u64 offset;
5759         unsigned type;
5760         int name_len;
5761 };
5762
5763 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5764 {
5765         while (entries--) {
5766                 struct dir_entry *entry = addr;
5767                 char *name = (char *)(entry + 1);
5768
5769                 ctx->pos = get_unaligned(&entry->offset);
5770                 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5771                                          get_unaligned(&entry->ino),
5772                                          get_unaligned(&entry->type)))
5773                         return 1;
5774                 addr += sizeof(struct dir_entry) +
5775                         get_unaligned(&entry->name_len);
5776                 ctx->pos++;
5777         }
5778         return 0;
5779 }
5780
5781 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5782 {
5783         struct inode *inode = file_inode(file);
5784         struct btrfs_root *root = BTRFS_I(inode)->root;
5785         struct btrfs_file_private *private = file->private_data;
5786         struct btrfs_dir_item *di;
5787         struct btrfs_key key;
5788         struct btrfs_key found_key;
5789         struct btrfs_path *path;
5790         void *addr;
5791         struct list_head ins_list;
5792         struct list_head del_list;
5793         int ret;
5794         struct extent_buffer *leaf;
5795         int slot;
5796         char *name_ptr;
5797         int name_len;
5798         int entries = 0;
5799         int total_len = 0;
5800         bool put = false;
5801         struct btrfs_key location;
5802
5803         if (!dir_emit_dots(file, ctx))
5804                 return 0;
5805
5806         path = btrfs_alloc_path();
5807         if (!path)
5808                 return -ENOMEM;
5809
5810         addr = private->filldir_buf;
5811         path->reada = READA_FORWARD;
5812
5813         INIT_LIST_HEAD(&ins_list);
5814         INIT_LIST_HEAD(&del_list);
5815         put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5816
5817 again:
5818         key.type = BTRFS_DIR_INDEX_KEY;
5819         key.offset = ctx->pos;
5820         key.objectid = btrfs_ino(BTRFS_I(inode));
5821
5822         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5823         if (ret < 0)
5824                 goto err;
5825
5826         while (1) {
5827                 struct dir_entry *entry;
5828
5829                 leaf = path->nodes[0];
5830                 slot = path->slots[0];
5831                 if (slot >= btrfs_header_nritems(leaf)) {
5832                         ret = btrfs_next_leaf(root, path);
5833                         if (ret < 0)
5834                                 goto err;
5835                         else if (ret > 0)
5836                                 break;
5837                         continue;
5838                 }
5839
5840                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5841
5842                 if (found_key.objectid != key.objectid)
5843                         break;
5844                 if (found_key.type != BTRFS_DIR_INDEX_KEY)
5845                         break;
5846                 if (found_key.offset < ctx->pos)
5847                         goto next;
5848                 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5849                         goto next;
5850                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5851                 name_len = btrfs_dir_name_len(leaf, di);
5852                 if ((total_len + sizeof(struct dir_entry) + name_len) >=
5853                     PAGE_SIZE) {
5854                         btrfs_release_path(path);
5855                         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5856                         if (ret)
5857                                 goto nopos;
5858                         addr = private->filldir_buf;
5859                         entries = 0;
5860                         total_len = 0;
5861                         goto again;
5862                 }
5863
5864                 entry = addr;
5865                 put_unaligned(name_len, &entry->name_len);
5866                 name_ptr = (char *)(entry + 1);
5867                 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
5868                                    name_len);
5869                 put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
5870                                 &entry->type);
5871                 btrfs_dir_item_key_to_cpu(leaf, di, &location);
5872                 put_unaligned(location.objectid, &entry->ino);
5873                 put_unaligned(found_key.offset, &entry->offset);
5874                 entries++;
5875                 addr += sizeof(struct dir_entry) + name_len;
5876                 total_len += sizeof(struct dir_entry) + name_len;
5877 next:
5878                 path->slots[0]++;
5879         }
5880         btrfs_release_path(path);
5881
5882         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5883         if (ret)
5884                 goto nopos;
5885
5886         ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5887         if (ret)
5888                 goto nopos;
5889
5890         /*
5891          * Stop new entries from being returned after we return the last
5892          * entry.
5893          *
5894          * New directory entries are assigned a strictly increasing
5895          * offset.  This means that new entries created during readdir
5896          * are *guaranteed* to be seen in the future by that readdir.
5897          * This has broken buggy programs which operate on names as
5898          * they're returned by readdir.  Until we re-use freed offsets
5899          * we have this hack to stop new entries from being returned
5900          * under the assumption that they'll never reach this huge
5901          * offset.
5902          *
5903          * This is being careful not to overflow 32bit loff_t unless the
5904          * last entry requires it because doing so has broken 32bit apps
5905          * in the past.
5906          */
5907         if (ctx->pos >= INT_MAX)
5908                 ctx->pos = LLONG_MAX;
5909         else
5910                 ctx->pos = INT_MAX;
5911 nopos:
5912         ret = 0;
5913 err:
5914         if (put)
5915                 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
5916         btrfs_free_path(path);
5917         return ret;
5918 }
5919
5920 /*
5921  * This is somewhat expensive, updating the tree every time the
5922  * inode changes.  But, it is most likely to find the inode in cache.
5923  * FIXME, needs more benchmarking...there are no reasons other than performance
5924  * to keep or drop this code.
5925  */
5926 static int btrfs_dirty_inode(struct inode *inode)
5927 {
5928         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5929         struct btrfs_root *root = BTRFS_I(inode)->root;
5930         struct btrfs_trans_handle *trans;
5931         int ret;
5932
5933         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5934                 return 0;
5935
5936         trans = btrfs_join_transaction(root);
5937         if (IS_ERR(trans))
5938                 return PTR_ERR(trans);
5939
5940         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5941         if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
5942                 /* whoops, lets try again with the full transaction */
5943                 btrfs_end_transaction(trans);
5944                 trans = btrfs_start_transaction(root, 1);
5945                 if (IS_ERR(trans))
5946                         return PTR_ERR(trans);
5947
5948                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5949         }
5950         btrfs_end_transaction(trans);
5951         if (BTRFS_I(inode)->delayed_node)
5952                 btrfs_balance_delayed_items(fs_info);
5953
5954         return ret;
5955 }
5956
5957 /*
5958  * This is a copy of file_update_time.  We need this so we can return error on
5959  * ENOSPC for updating the inode in the case of file write and mmap writes.
5960  */
5961 static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
5962                              int flags)
5963 {
5964         struct btrfs_root *root = BTRFS_I(inode)->root;
5965         bool dirty = flags & ~S_VERSION;
5966
5967         if (btrfs_root_readonly(root))
5968                 return -EROFS;
5969
5970         if (flags & S_VERSION)
5971                 dirty |= inode_maybe_inc_iversion(inode, dirty);
5972         if (flags & S_CTIME)
5973                 inode->i_ctime = *now;
5974         if (flags & S_MTIME)
5975                 inode->i_mtime = *now;
5976         if (flags & S_ATIME)
5977                 inode->i_atime = *now;
5978         return dirty ? btrfs_dirty_inode(inode) : 0;
5979 }
5980
5981 /*
5982  * find the highest existing sequence number in a directory
5983  * and then set the in-memory index_cnt variable to reflect
5984  * free sequence numbers
5985  */
5986 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5987 {
5988         struct btrfs_root *root = inode->root;
5989         struct btrfs_key key, found_key;
5990         struct btrfs_path *path;
5991         struct extent_buffer *leaf;
5992         int ret;
5993
5994         key.objectid = btrfs_ino(inode);
5995         key.type = BTRFS_DIR_INDEX_KEY;
5996         key.offset = (u64)-1;
5997
5998         path = btrfs_alloc_path();
5999         if (!path)
6000                 return -ENOMEM;
6001
6002         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6003         if (ret < 0)
6004                 goto out;
6005         /* FIXME: we should be able to handle this */
6006         if (ret == 0)
6007                 goto out;
6008         ret = 0;
6009
6010         if (path->slots[0] == 0) {
6011                 inode->index_cnt = BTRFS_DIR_START_INDEX;
6012                 goto out;
6013         }
6014
6015         path->slots[0]--;
6016
6017         leaf = path->nodes[0];
6018         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6019
6020         if (found_key.objectid != btrfs_ino(inode) ||
6021             found_key.type != BTRFS_DIR_INDEX_KEY) {
6022                 inode->index_cnt = BTRFS_DIR_START_INDEX;
6023                 goto out;
6024         }
6025
6026         inode->index_cnt = found_key.offset + 1;
6027 out:
6028         btrfs_free_path(path);
6029         return ret;
6030 }
6031
6032 /*
6033  * helper to find a free sequence number in a given directory.  This current
6034  * code is very simple, later versions will do smarter things in the btree
6035  */
6036 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6037 {
6038         int ret = 0;
6039
6040         if (dir->index_cnt == (u64)-1) {
6041                 ret = btrfs_inode_delayed_dir_index_count(dir);
6042                 if (ret) {
6043                         ret = btrfs_set_inode_index_count(dir);
6044                         if (ret)
6045                                 return ret;
6046                 }
6047         }
6048
6049         *index = dir->index_cnt;
6050         dir->index_cnt++;
6051
6052         return ret;
6053 }
6054
6055 static int btrfs_insert_inode_locked(struct inode *inode)
6056 {
6057         struct btrfs_iget_args args;
6058
6059         args.ino = BTRFS_I(inode)->location.objectid;
6060         args.root = BTRFS_I(inode)->root;
6061
6062         return insert_inode_locked4(inode,
6063                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6064                    btrfs_find_actor, &args);
6065 }
6066
6067 /*
6068  * Inherit flags from the parent inode.
6069  *
6070  * Currently only the compression flags and the cow flags are inherited.
6071  */
6072 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
6073 {
6074         unsigned int flags;
6075
6076         if (!dir)
6077                 return;
6078
6079         flags = BTRFS_I(dir)->flags;
6080
6081         if (flags & BTRFS_INODE_NOCOMPRESS) {
6082                 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
6083                 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
6084         } else if (flags & BTRFS_INODE_COMPRESS) {
6085                 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
6086                 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
6087         }
6088
6089         if (flags & BTRFS_INODE_NODATACOW) {
6090                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
6091                 if (S_ISREG(inode->i_mode))
6092                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6093         }
6094
6095         btrfs_sync_inode_flags_to_i_flags(inode);
6096 }
6097
6098 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6099                                      struct btrfs_root *root,
6100                                      struct user_namespace *mnt_userns,
6101                                      struct inode *dir,
6102                                      const char *name, int name_len,
6103                                      u64 ref_objectid, u64 objectid,
6104                                      umode_t mode, u64 *index)
6105 {
6106         struct btrfs_fs_info *fs_info = root->fs_info;
6107         struct inode *inode;
6108         struct btrfs_inode_item *inode_item;
6109         struct btrfs_key *location;
6110         struct btrfs_path *path;
6111         struct btrfs_inode_ref *ref;
6112         struct btrfs_key key[2];
6113         u32 sizes[2];
6114         struct btrfs_item_batch batch;
6115         unsigned long ptr;
6116         unsigned int nofs_flag;
6117         int ret;
6118
6119         path = btrfs_alloc_path();
6120         if (!path)
6121                 return ERR_PTR(-ENOMEM);
6122
6123         nofs_flag = memalloc_nofs_save();
6124         inode = new_inode(fs_info->sb);
6125         memalloc_nofs_restore(nofs_flag);
6126         if (!inode) {
6127                 btrfs_free_path(path);
6128                 return ERR_PTR(-ENOMEM);
6129         }
6130
6131         /*
6132          * O_TMPFILE, set link count to 0, so that after this point,
6133          * we fill in an inode item with the correct link count.
6134          */
6135         if (!name)
6136                 set_nlink(inode, 0);
6137
6138         /*
6139          * we have to initialize this early, so we can reclaim the inode
6140          * number if we fail afterwards in this function.
6141          */
6142         inode->i_ino = objectid;
6143
6144         if (dir && name) {
6145                 trace_btrfs_inode_request(dir);
6146
6147                 ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6148                 if (ret) {
6149                         btrfs_free_path(path);
6150                         iput(inode);
6151                         return ERR_PTR(ret);
6152                 }
6153         } else if (dir) {
6154                 *index = 0;
6155         }
6156         /*
6157          * index_cnt is ignored for everything but a dir,
6158          * btrfs_set_inode_index_count has an explanation for the magic
6159          * number
6160          */
6161         BTRFS_I(inode)->index_cnt = 2;
6162         BTRFS_I(inode)->dir_index = *index;
6163         BTRFS_I(inode)->root = btrfs_grab_root(root);
6164         BTRFS_I(inode)->generation = trans->transid;
6165         inode->i_generation = BTRFS_I(inode)->generation;
6166
6167         /*
6168          * We could have gotten an inode number from somebody who was fsynced
6169          * and then removed in this same transaction, so let's just set full
6170          * sync since it will be a full sync anyway and this will blow away the
6171          * old info in the log.
6172          */
6173         btrfs_set_inode_full_sync(BTRFS_I(inode));
6174
6175         key[0].objectid = objectid;
6176         key[0].type = BTRFS_INODE_ITEM_KEY;
6177         key[0].offset = 0;
6178
6179         sizes[0] = sizeof(struct btrfs_inode_item);
6180
6181         if (name) {
6182                 /*
6183                  * Start new inodes with an inode_ref. This is slightly more
6184                  * efficient for small numbers of hard links since they will
6185                  * be packed into one item. Extended refs will kick in if we
6186                  * add more hard links than can fit in the ref item.
6187                  */
6188                 key[1].objectid = objectid;
6189                 key[1].type = BTRFS_INODE_REF_KEY;
6190                 key[1].offset = ref_objectid;
6191
6192                 sizes[1] = name_len + sizeof(*ref);
6193         }
6194
6195         location = &BTRFS_I(inode)->location;
6196         location->objectid = objectid;
6197         location->offset = 0;
6198         location->type = BTRFS_INODE_ITEM_KEY;
6199
6200         ret = btrfs_insert_inode_locked(inode);
6201         if (ret < 0) {
6202                 iput(inode);
6203                 goto fail;
6204         }
6205
6206         batch.keys = &key[0];
6207         batch.data_sizes = &sizes[0];
6208         batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
6209         batch.nr = name ? 2 : 1;
6210         ret = btrfs_insert_empty_items(trans, root, path, &batch);
6211         if (ret != 0)
6212                 goto fail_unlock;
6213
6214         inode_init_owner(mnt_userns, inode, dir, mode);
6215         inode_set_bytes(inode, 0);
6216
6217         inode->i_mtime = current_time(inode);
6218         inode->i_atime = inode->i_mtime;
6219         inode->i_ctime = inode->i_mtime;
6220         BTRFS_I(inode)->i_otime = inode->i_mtime;
6221
6222         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6223                                   struct btrfs_inode_item);
6224         memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6225                              sizeof(*inode_item));
6226         fill_inode_item(trans, path->nodes[0], inode_item, inode);
6227
6228         if (name) {
6229                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6230                                      struct btrfs_inode_ref);
6231                 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6232                 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6233                 ptr = (unsigned long)(ref + 1);
6234                 write_extent_buffer(path->nodes[0], name, ptr, name_len);
6235         }
6236
6237         btrfs_mark_buffer_dirty(path->nodes[0]);
6238         btrfs_free_path(path);
6239
6240         btrfs_inherit_iflags(inode, dir);
6241
6242         if (S_ISREG(mode)) {
6243                 if (btrfs_test_opt(fs_info, NODATASUM))
6244                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6245                 if (btrfs_test_opt(fs_info, NODATACOW))
6246                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6247                                 BTRFS_INODE_NODATASUM;
6248         }
6249
6250         inode_tree_add(inode);
6251
6252         trace_btrfs_inode_new(inode);
6253         btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6254
6255         btrfs_update_root_times(trans, root);
6256
6257         ret = btrfs_inode_inherit_props(trans, inode, dir);
6258         if (ret)
6259                 btrfs_err(fs_info,
6260                           "error inheriting props for ino %llu (root %llu): %d",
6261                         btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6262
6263         return inode;
6264
6265 fail_unlock:
6266         discard_new_inode(inode);
6267 fail:
6268         if (dir && name)
6269                 BTRFS_I(dir)->index_cnt--;
6270         btrfs_free_path(path);
6271         return ERR_PTR(ret);
6272 }
6273
6274 /*
6275  * utility function to add 'inode' into 'parent_inode' with
6276  * a give name and a given sequence number.
6277  * if 'add_backref' is true, also insert a backref from the
6278  * inode to the parent directory.
6279  */
6280 int btrfs_add_link(struct btrfs_trans_handle *trans,
6281                    struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6282                    const char *name, int name_len, int add_backref, u64 index)
6283 {
6284         int ret = 0;
6285         struct btrfs_key key;
6286         struct btrfs_root *root = parent_inode->root;
6287         u64 ino = btrfs_ino(inode);
6288         u64 parent_ino = btrfs_ino(parent_inode);
6289
6290         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6291                 memcpy(&key, &inode->root->root_key, sizeof(key));
6292         } else {
6293                 key.objectid = ino;
6294                 key.type = BTRFS_INODE_ITEM_KEY;
6295                 key.offset = 0;
6296         }
6297
6298         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6299                 ret = btrfs_add_root_ref(trans, key.objectid,
6300                                          root->root_key.objectid, parent_ino,
6301                                          index, name, name_len);
6302         } else if (add_backref) {
6303                 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6304                                              parent_ino, index);
6305         }
6306
6307         /* Nothing to clean up yet */
6308         if (ret)
6309                 return ret;
6310
6311         ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
6312                                     btrfs_inode_type(&inode->vfs_inode), index);
6313         if (ret == -EEXIST || ret == -EOVERFLOW)
6314                 goto fail_dir_item;
6315         else if (ret) {
6316                 btrfs_abort_transaction(trans, ret);
6317                 return ret;
6318         }
6319
6320         btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6321                            name_len * 2);
6322         inode_inc_iversion(&parent_inode->vfs_inode);
6323         /*
6324          * If we are replaying a log tree, we do not want to update the mtime
6325          * and ctime of the parent directory with the current time, since the
6326          * log replay procedure is responsible for setting them to their correct
6327          * values (the ones it had when the fsync was done).
6328          */
6329         if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
6330                 struct timespec64 now = current_time(&parent_inode->vfs_inode);
6331
6332                 parent_inode->vfs_inode.i_mtime = now;
6333                 parent_inode->vfs_inode.i_ctime = now;
6334         }
6335         ret = btrfs_update_inode(trans, root, parent_inode);
6336         if (ret)
6337                 btrfs_abort_transaction(trans, ret);
6338         return ret;
6339
6340 fail_dir_item:
6341         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6342                 u64 local_index;
6343                 int err;
6344                 err = btrfs_del_root_ref(trans, key.objectid,
6345                                          root->root_key.objectid, parent_ino,
6346                                          &local_index, name, name_len);
6347                 if (err)
6348                         btrfs_abort_transaction(trans, err);
6349         } else if (add_backref) {
6350                 u64 local_index;
6351                 int err;
6352
6353                 err = btrfs_del_inode_ref(trans, root, name, name_len,
6354                                           ino, parent_ino, &local_index);
6355                 if (err)
6356                         btrfs_abort_transaction(trans, err);
6357         }
6358
6359         /* Return the original error code */
6360         return ret;
6361 }
6362
6363 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6364                             struct btrfs_inode *dir, struct dentry *dentry,
6365                             struct btrfs_inode *inode, int backref, u64 index)
6366 {
6367         int err = btrfs_add_link(trans, dir, inode,
6368                                  dentry->d_name.name, dentry->d_name.len,
6369                                  backref, index);
6370         if (err > 0)
6371                 err = -EEXIST;
6372         return err;
6373 }
6374
6375 static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
6376                        struct dentry *dentry, umode_t mode, dev_t rdev)
6377 {
6378         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6379         struct btrfs_trans_handle *trans;
6380         struct btrfs_root *root = BTRFS_I(dir)->root;
6381         struct inode *inode = NULL;
6382         int err;
6383         u64 objectid;
6384         u64 index = 0;
6385
6386         /*
6387          * 2 for inode item and ref
6388          * 2 for dir items
6389          * 1 for xattr if selinux is on
6390          */
6391         trans = btrfs_start_transaction(root, 5);
6392         if (IS_ERR(trans))
6393                 return PTR_ERR(trans);
6394
6395         err = btrfs_get_free_objectid(root, &objectid);
6396         if (err)
6397                 goto out_unlock;
6398
6399         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6400                         dentry->d_name.name, dentry->d_name.len,
6401                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6402         if (IS_ERR(inode)) {
6403                 err = PTR_ERR(inode);
6404                 inode = NULL;
6405                 goto out_unlock;
6406         }
6407
6408         /*
6409         * If the active LSM wants to access the inode during
6410         * d_instantiate it needs these. Smack checks to see
6411         * if the filesystem supports xattrs by looking at the
6412         * ops vector.
6413         */
6414         inode->i_op = &btrfs_special_inode_operations;
6415         init_special_inode(inode, inode->i_mode, rdev);
6416
6417         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6418         if (err)
6419                 goto out_unlock;
6420
6421         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6422                         0, index);
6423         if (err)
6424                 goto out_unlock;
6425
6426         btrfs_update_inode(trans, root, BTRFS_I(inode));
6427         d_instantiate_new(dentry, inode);
6428
6429 out_unlock:
6430         btrfs_end_transaction(trans);
6431         btrfs_btree_balance_dirty(fs_info);
6432         if (err && inode) {
6433                 inode_dec_link_count(inode);
6434                 discard_new_inode(inode);
6435         }
6436         return err;
6437 }
6438
6439 static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
6440                         struct dentry *dentry, umode_t mode, bool excl)
6441 {
6442         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6443         struct btrfs_trans_handle *trans;
6444         struct btrfs_root *root = BTRFS_I(dir)->root;
6445         struct inode *inode = NULL;
6446         int err;
6447         u64 objectid;
6448         u64 index = 0;
6449
6450         /*
6451          * 2 for inode item and ref
6452          * 2 for dir items
6453          * 1 for xattr if selinux is on
6454          */
6455         trans = btrfs_start_transaction(root, 5);
6456         if (IS_ERR(trans))
6457                 return PTR_ERR(trans);
6458
6459         err = btrfs_get_free_objectid(root, &objectid);
6460         if (err)
6461                 goto out_unlock;
6462
6463         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6464                         dentry->d_name.name, dentry->d_name.len,
6465                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6466         if (IS_ERR(inode)) {
6467                 err = PTR_ERR(inode);
6468                 inode = NULL;
6469                 goto out_unlock;
6470         }
6471         /*
6472         * If the active LSM wants to access the inode during
6473         * d_instantiate it needs these. Smack checks to see
6474         * if the filesystem supports xattrs by looking at the
6475         * ops vector.
6476         */
6477         inode->i_fop = &btrfs_file_operations;
6478         inode->i_op = &btrfs_file_inode_operations;
6479         inode->i_mapping->a_ops = &btrfs_aops;
6480
6481         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6482         if (err)
6483                 goto out_unlock;
6484
6485         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6486         if (err)
6487                 goto out_unlock;
6488
6489         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6490                         0, index);
6491         if (err)
6492                 goto out_unlock;
6493
6494         d_instantiate_new(dentry, inode);
6495
6496 out_unlock:
6497         btrfs_end_transaction(trans);
6498         if (err && inode) {
6499                 inode_dec_link_count(inode);
6500                 discard_new_inode(inode);
6501         }
6502         btrfs_btree_balance_dirty(fs_info);
6503         return err;
6504 }
6505
6506 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6507                       struct dentry *dentry)
6508 {
6509         struct btrfs_trans_handle *trans = NULL;
6510         struct btrfs_root *root = BTRFS_I(dir)->root;
6511         struct inode *inode = d_inode(old_dentry);
6512         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6513         u64 index;
6514         int err;
6515         int drop_inode = 0;
6516
6517         /* do not allow sys_link's with other subvols of the same device */
6518         if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6519                 return -EXDEV;
6520
6521         if (inode->i_nlink >= BTRFS_LINK_MAX)
6522                 return -EMLINK;
6523
6524         err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6525         if (err)
6526                 goto fail;
6527
6528         /*
6529          * 2 items for inode and inode ref
6530          * 2 items for dir items
6531          * 1 item for parent inode
6532          * 1 item for orphan item deletion if O_TMPFILE
6533          */
6534         trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6535         if (IS_ERR(trans)) {
6536                 err = PTR_ERR(trans);
6537                 trans = NULL;
6538                 goto fail;
6539         }
6540
6541         /* There are several dir indexes for this inode, clear the cache. */
6542         BTRFS_I(inode)->dir_index = 0ULL;
6543         inc_nlink(inode);
6544         inode_inc_iversion(inode);
6545         inode->i_ctime = current_time(inode);
6546         ihold(inode);
6547         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6548
6549         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6550                         1, index);
6551
6552         if (err) {
6553                 drop_inode = 1;
6554         } else {
6555                 struct dentry *parent = dentry->d_parent;
6556
6557                 err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6558                 if (err)
6559                         goto fail;
6560                 if (inode->i_nlink == 1) {
6561                         /*
6562                          * If new hard link count is 1, it's a file created
6563                          * with open(2) O_TMPFILE flag.
6564                          */
6565                         err = btrfs_orphan_del(trans, BTRFS_I(inode));
6566                         if (err)
6567                                 goto fail;
6568                 }
6569                 d_instantiate(dentry, inode);
6570                 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6571         }
6572
6573 fail:
6574         if (trans)
6575                 btrfs_end_transaction(trans);
6576         if (drop_inode) {
6577                 inode_dec_link_count(inode);
6578                 iput(inode);
6579         }
6580         btrfs_btree_balance_dirty(fs_info);
6581         return err;
6582 }
6583
6584 static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
6585                        struct dentry *dentry, umode_t mode)
6586 {
6587         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6588         struct inode *inode = NULL;
6589         struct btrfs_trans_handle *trans;
6590         struct btrfs_root *root = BTRFS_I(dir)->root;
6591         int err = 0;
6592         u64 objectid = 0;
6593         u64 index = 0;
6594
6595         /*
6596          * 2 items for inode and ref
6597          * 2 items for dir items
6598          * 1 for xattr if selinux is on
6599          */
6600         trans = btrfs_start_transaction(root, 5);
6601         if (IS_ERR(trans))
6602                 return PTR_ERR(trans);
6603
6604         err = btrfs_get_free_objectid(root, &objectid);
6605         if (err)
6606                 goto out_fail;
6607
6608         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6609                         dentry->d_name.name, dentry->d_name.len,
6610                         btrfs_ino(BTRFS_I(dir)), objectid,
6611                         S_IFDIR | mode, &index);
6612         if (IS_ERR(inode)) {
6613                 err = PTR_ERR(inode);
6614                 inode = NULL;
6615                 goto out_fail;
6616         }
6617
6618         /* these must be set before we unlock the inode */
6619         inode->i_op = &btrfs_dir_inode_operations;
6620         inode->i_fop = &btrfs_dir_file_operations;
6621
6622         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6623         if (err)
6624                 goto out_fail;
6625
6626         btrfs_i_size_write(BTRFS_I(inode), 0);
6627         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6628         if (err)
6629                 goto out_fail;
6630
6631         err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6632                         dentry->d_name.name,
6633                         dentry->d_name.len, 0, index);
6634         if (err)
6635                 goto out_fail;
6636
6637         d_instantiate_new(dentry, inode);
6638
6639 out_fail:
6640         btrfs_end_transaction(trans);
6641         if (err && inode) {
6642                 inode_dec_link_count(inode);
6643                 discard_new_inode(inode);
6644         }
6645         btrfs_btree_balance_dirty(fs_info);
6646         return err;
6647 }
6648
6649 static noinline int uncompress_inline(struct btrfs_path *path,
6650                                       struct page *page,
6651                                       size_t pg_offset, u64 extent_offset,
6652                                       struct btrfs_file_extent_item *item)
6653 {
6654         int ret;
6655         struct extent_buffer *leaf = path->nodes[0];
6656         char *tmp;
6657         size_t max_size;
6658         unsigned long inline_size;
6659         unsigned long ptr;
6660         int compress_type;
6661
6662         WARN_ON(pg_offset != 0);
6663         compress_type = btrfs_file_extent_compression(leaf, item);
6664         max_size = btrfs_file_extent_ram_bytes(leaf, item);
6665         inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6666         tmp = kmalloc(inline_size, GFP_NOFS);
6667         if (!tmp)
6668                 return -ENOMEM;
6669         ptr = btrfs_file_extent_inline_start(item);
6670
6671         read_extent_buffer(leaf, tmp, ptr, inline_size);
6672
6673         max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6674         ret = btrfs_decompress(compress_type, tmp, page,
6675                                extent_offset, inline_size, max_size);
6676
6677         /*
6678          * decompression code contains a memset to fill in any space between the end
6679          * of the uncompressed data and the end of max_size in case the decompressed
6680          * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6681          * the end of an inline extent and the beginning of the next block, so we
6682          * cover that region here.
6683          */
6684
6685         if (max_size + pg_offset < PAGE_SIZE)
6686                 memzero_page(page,  pg_offset + max_size,
6687                              PAGE_SIZE - max_size - pg_offset);
6688         kfree(tmp);
6689         return ret;
6690 }
6691
6692 /**
6693  * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
6694  * @inode:      file to search in
6695  * @page:       page to read extent data into if the extent is inline
6696  * @pg_offset:  offset into @page to copy to
6697  * @start:      file offset
6698  * @len:        length of range starting at @start
6699  *
6700  * This returns the first &struct extent_map which overlaps with the given
6701  * range, reading it from the B-tree and caching it if necessary. Note that
6702  * there may be more extents which overlap the given range after the returned
6703  * extent_map.
6704  *
6705  * If @page is not NULL and the extent is inline, this also reads the extent
6706  * data directly into the page and marks the extent up to date in the io_tree.
6707  *
      * If no file extent item covers @start, the returned map describes a hole
      * (block_start == EXTENT_MAP_HOLE). For an inline extent with a NULL @page
      * the map is returned without being added to the extent map tree.
      *
6708  * Return: ERR_PTR on error, non-NULL extent_map on success.
      * Errors generated here are -ENOMEM (allocation failure), -EUCLEAN
      * (regular/prealloc extent on a non-regular inode) and -EIO (the computed
      * map does not contain @start); errors from the tree search, from inline
      * decompression and from btrfs_add_extent_mapping() are passed through.
6709  */
6710 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6711                                     struct page *page, size_t pg_offset,
6712                                     u64 start, u64 len)
6713 {
6714         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6715         int ret = 0;
6716         u64 extent_start = 0;
6717         u64 extent_end = 0;
6718         u64 objectid = btrfs_ino(inode);
6719         int extent_type = -1;
6720         struct btrfs_path *path = NULL;
6721         struct btrfs_root *root = inode->root;
6722         struct btrfs_file_extent_item *item;
6723         struct extent_buffer *leaf;
6724         struct btrfs_key found_key;
6725         struct extent_map *em = NULL;
6726         struct extent_map_tree *em_tree = &inode->extent_tree;
6727         struct extent_io_tree *io_tree = &inode->io_tree;
6728
6729         read_lock(&em_tree->lock);
6730         em = lookup_extent_mapping(em_tree, start, len);
6731         read_unlock(&em_tree->lock);
6732
             /*
              * Reuse the cached map only if it really contains @start.  A
              * cached inline map is dropped when @page is given, so that the
              * lookup below runs and can fill the page.
              */
6733         if (em) {
6734                 if (em->start > start || em->start + em->len <= start)
6735                         free_extent_map(em);
6736                 else if (em->block_start == EXTENT_MAP_INLINE && page)
6737                         free_extent_map(em);
6738                 else
6739                         goto out;
6740         }
6741         em = alloc_extent_map();
6742         if (!em) {
6743                 ret = -ENOMEM;
6744                 goto out;
6745         }
6746         em->start = EXTENT_MAP_HOLE;
6747         em->orig_start = EXTENT_MAP_HOLE;
6748         em->len = (u64)-1;
6749         em->block_len = (u64)-1;
6750
6751         path = btrfs_alloc_path();
6752         if (!path) {
6753                 ret = -ENOMEM;
6754                 goto out;
6755         }
6756
6757         /* Chances are we'll be called again, so go ahead and do readahead */
6758         path->reada = READA_FORWARD;
6759
6760         /*
6761          * The same explanation in load_free_space_cache applies here as well,
6762          * we only read when we're loading the free space cache, and at that
6763          * point the commit_root has everything we need.
6764          */
6765         if (btrfs_is_free_space_inode(inode)) {
6766                 path->search_commit_root = 1;
6767                 path->skip_locking = 1;
6768         }
6769
6770         ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6771         if (ret < 0) {
6772                 goto out;
6773         } else if (ret > 0) {
                     /*
                      * No exact key match: look at the item in the previous
                      * slot instead, if there is one in this leaf.
                      */
6774                 if (path->slots[0] == 0)
6775                         goto not_found;
6776                 path->slots[0]--;
6777                 ret = 0;
6778         }
6779
6780         leaf = path->nodes[0];
6781         item = btrfs_item_ptr(leaf, path->slots[0],
6782                               struct btrfs_file_extent_item);
6783         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6784         if (found_key.objectid != objectid ||
6785             found_key.type != BTRFS_EXTENT_DATA_KEY) {
6786                 /*
6787                  * If we backup past the first extent we want to move forward
6788                  * and see if there is an extent in front of us, otherwise we'll
6789                  * say there is a hole for our whole search range which can
6790                  * cause problems.
6791                  */
6792                 extent_end = start;
6793                 goto next;
6794         }
6795
6796         extent_type = btrfs_file_extent_type(leaf, item);
6797         extent_start = found_key.offset;
6798         extent_end = btrfs_file_extent_end(path);
6799         if (extent_type == BTRFS_FILE_EXTENT_REG ||
6800             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6801                 /* Only regular file could have regular/prealloc extent */
6802                 if (!S_ISREG(inode->vfs_inode.i_mode)) {
6803                         ret = -EUCLEAN;
6804                         btrfs_crit(fs_info,
6805                 "regular/prealloc extent found for non-regular inode %llu",
6806                                    btrfs_ino(inode));
6807                         goto out;
6808                 }
6809                 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6810                                                        extent_start);
6811         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6812                 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6813                                                       path->slots[0],
6814                                                       extent_start);
6815         }
6816 next:
             /*
              * The item examined so far ends at or before @start (or was not
              * an extent item of this inode).  Step to the following item; if
              * it is an extent item of this inode that begins inside the
              * requested range, return a hole spanning from @start up to it.
              */
6817         if (start >= extent_end) {
6818                 path->slots[0]++;
6819                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6820                         ret = btrfs_next_leaf(root, path);
6821                         if (ret < 0)
6822                                 goto out;
6823                         else if (ret > 0)
6824                                 goto not_found;
6825
6826                         leaf = path->nodes[0];
6827                 }
6828                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6829                 if (found_key.objectid != objectid ||
6830                     found_key.type != BTRFS_EXTENT_DATA_KEY)
6831                         goto not_found;
6832                 if (start + len <= found_key.offset)
6833                         goto not_found;
6834                 if (start > found_key.offset)
6835                         goto next;
6836
6837                 /* New extent overlaps with existing one */
6838                 em->start = start;
6839                 em->orig_start = start;
6840                 em->len = found_key.offset - start;
6841                 em->block_start = EXTENT_MAP_HOLE;
6842                 goto insert;
6843         }
6844
6845         btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
6846
6847         if (extent_type == BTRFS_FILE_EXTENT_REG ||
6848             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6849                 goto insert;
6850         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6851                 unsigned long ptr;
6852                 char *map;
6853                 size_t size;
6854                 size_t extent_offset;
6855                 size_t copy_size;
6856
                     /*
                      * No page to fill: hand back the map as built, skipping
                      * the insertion into the extent map tree below.
                      */
6857                 if (!page)
6858                         goto out;
6859
6860                 size = btrfs_file_extent_ram_bytes(leaf, item);
6861                 extent_offset = page_offset(page) + pg_offset - extent_start;
6862                 copy_size = min_t(u64, PAGE_SIZE - pg_offset,
6863                                   size - extent_offset);
6864                 em->start = extent_start + extent_offset;
6865                 em->len = ALIGN(copy_size, fs_info->sectorsize);
6866                 em->orig_block_len = em->len;
6867                 em->orig_start = em->start;
6868                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6869
6870                 if (!PageUptodate(page)) {
6871                         if (btrfs_file_extent_compression(leaf, item) !=
6872                             BTRFS_COMPRESS_NONE) {
6873                                 ret = uncompress_inline(path, page, pg_offset,
6874                                                         extent_offset, item);
6875                                 if (ret)
6876                                         goto out;
6877                         } else {
                                     /*
                                      * Uncompressed inline data: copy it
                                      * from the leaf and zero the rest of
                                      * the page.
                                      */
6878                                 map = kmap_local_page(page);
6879                                 read_extent_buffer(leaf, map + pg_offset, ptr,
6880                                                    copy_size);
6881                                 if (pg_offset + copy_size < PAGE_SIZE) {
6882                                         memset(map + pg_offset + copy_size, 0,
6883                                                PAGE_SIZE - pg_offset -
6884                                                copy_size);
6885                                 }
6886                                 kunmap_local(map);
6887                         }
6888                         flush_dcache_page(page);
6889                 }
6890                 set_extent_uptodate(io_tree, em->start,
6891                                     extent_map_end(em) - 1, NULL, GFP_NOFS);
6892                 goto insert;
6893         }
6894 not_found:
             /* Nothing covers the range: report [@start, @start + @len) as a hole. */
6895         em->start = start;
6896         em->orig_start = start;
6897         em->len = len;
6898         em->block_start = EXTENT_MAP_HOLE;
6899 insert:
6900         ret = 0;
6901         btrfs_release_path(path);
             /* Sanity check: the map about to be cached must contain @start. */
6902         if (em->start > start || extent_map_end(em) <= start) {
6903                 btrfs_err(fs_info,
6904                           "bad extent! em: [%llu %llu] passed [%llu %llu]",
6905                           em->start, em->len, start, len);
6906                 ret = -EIO;
6907                 goto out;
6908         }
6909
6910         write_lock(&em_tree->lock);
6911         ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6912         write_unlock(&em_tree->lock);
6913 out:
6914         btrfs_free_path(path);
6915
6916         trace_btrfs_get_extent(root, inode, em);
6917
             /* On any failure drop our reference to the map and return the error. */
6918         if (ret) {
6919                 free_extent_map(em);
6920                 return ERR_PTR(ret);
6921         }
6922         return em;
6923 }
6924
6925 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
6926                                            u64 start, u64 len)
6927 {
6928         struct extent_map *em;
6929         struct extent_map *hole_em = NULL;
6930         u64 delalloc_start = start;
6931         u64 end;
6932         u64 delalloc_len;
6933         u64 delalloc_end;
6934         int err = 0;
6935
6936         em = btrfs_get_extent(inode, NULL, 0, start, len);
6937         if (IS_ERR(em))
6938                 return em;
6939         /*
6940          * If our em maps to:
6941          * - a hole or
6942          * - a pre-alloc extent,
6943          * there might actually be delalloc bytes behind it.
6944          */
6945         if (em->block_start != EXTENT_MAP_HOLE &&
6946             !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6947                 return em;
6948         else
6949                 hole_em = em;
6950
6951         /* check to see if we've wrapped (len == -1 or similar) */
6952         end = start + len;
6953         if (end < start)
6954                 end = (u64)-1;
6955         else
6956                 end -= 1;
6957
6958         em = NULL;
6959
6960         /* ok, we didn't find anything, lets look for delalloc */
6961         delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
6962                                  end, len, EXTENT_DELALLOC, 1);
6963         delalloc_end = delalloc_start + delalloc_len;
6964         if (delalloc_end < delalloc_start)
6965                 delalloc_end = (u64)-1;
6966
6967         /*
6968          * We didn't find anything useful, return the original results from
6969          * get_extent()
6970          */
6971         if (delalloc_start > end || delalloc_end <= start) {
6972                 em = hole_em;
6973                 hole_em = NULL;
6974                 goto out;
6975         }
6976
6977         /*
6978          * Adjust the delalloc_start to make sure it doesn't go backwards from
6979          * the start they passed in
6980          */
6981         delalloc_start = max(start, delalloc_start);
6982         delalloc_len = delalloc_end - delalloc_start;
6983
6984         if (delalloc_len > 0) {
6985                 u64 hole_start;
6986                 u64 hole_len;
6987                 const u64 hole_end = extent_map_end(hole_em);
6988
6989                 em = alloc_extent_map();
6990                 if (!em) {
6991                         err = -ENOMEM;
6992                         goto out;
6993                 }
6994
6995                 ASSERT(hole_em);
6996                 /*
6997                  * When btrfs_get_extent can't find anything it returns one
6998                  * huge hole
6999                  *
7000                  * Make sure what it found really fits our range, and adjust to
7001                  * make sure it is based on the start from the caller
7002                  */
7003                 if (hole_end <= start || hole_em->start > end) {
7004                        free_extent_map(hole_em);
7005                        hole_em = NULL;
7006                 } else {
7007                        hole_start = max(hole_em->start, start);
7008                        hole_len = hole_end - hole_start;
7009                 }
7010
7011                 if (hole_em && delalloc_start > hole_start) {
7012                         /*
7013                          * Our hole starts before our delalloc, so we have to
7014                          * return just the parts of the hole that go until the
7015                          * delalloc starts
7016                          */
7017                         em->len = min(hole_len, delalloc_start - hole_start);
7018                         em->start = hole_start;
7019                         em->orig_start = hole_start;
7020                         /*
7021                          * Don't adjust block start at all, it is fixed at
7022                          * EXTENT_MAP_HOLE
7023                          */
7024                         em->block_start = hole_em->block_start;
7025                         em->block_len = hole_len;
7026                         if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7027                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7028                 } else {
7029                         /*
7030                          * Hole is out of passed range or it starts after
7031                          * delalloc range
7032                          */
7033                         em->start = delalloc_start;
7034                         em->len = delalloc_len;
7035                         em->orig_start = delalloc_start;
7036                         em->block_start = EXTENT_MAP_DELALLOC;
7037                         em->block_len = delalloc_len;
7038                 }
7039         } else {
7040                 return hole_em;
7041         }
7042 out:
7043
7044         free_extent_map(hole_em);
7045         if (err) {
7046                 free_extent_map(em);
7047                 return ERR_PTR(err);
7048         }
7049         return em;
7050 }
7051
7052 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
7053                                                   const u64 start,
7054                                                   const u64 len,
7055                                                   const u64 orig_start,
7056                                                   const u64 block_start,
7057                                                   const u64 block_len,
7058                                                   const u64 orig_block_len,
7059                                                   const u64 ram_bytes,
7060                                                   const int type)
7061 {
7062         struct extent_map *em = NULL;
7063         int ret;
7064
7065         if (type != BTRFS_ORDERED_NOCOW) {
7066                 em = create_io_em(inode, start, len, orig_start, block_start,
7067                                   block_len, orig_block_len, ram_bytes,
7068                                   BTRFS_COMPRESS_NONE, /* compress_type */
7069                                   type);
7070                 if (IS_ERR(em))
7071                         goto out;
7072         }
7073         ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
7074                                        block_len, 0,
7075                                        (1 << type) |
7076                                        (1 << BTRFS_ORDERED_DIRECT),
7077                                        BTRFS_COMPRESS_NONE);
7078         if (ret) {
7079                 if (em) {
7080                         free_extent_map(em);
7081                         btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
7082                 }
7083                 em = ERR_PTR(ret);
7084         }
7085  out:
7086
7087         return em;
7088 }
7089
7090 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7091                                                   u64 start, u64 len)
7092 {
7093         struct btrfs_root *root = inode->root;
7094         struct btrfs_fs_info *fs_info = root->fs_info;
7095         struct extent_map *em;
7096         struct btrfs_key ins;
7097         u64 alloc_hint;
7098         int ret;
7099
7100         alloc_hint = get_extent_allocation_hint(inode, start, len);
7101         ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7102                                    0, alloc_hint, &ins, 1, 1);
7103         if (ret)
7104                 return ERR_PTR(ret);
7105
7106         em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7107                                      ins.objectid, ins.offset, ins.offset,
7108                                      ins.offset, BTRFS_ORDERED_REGULAR);
7109         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7110         if (IS_ERR(em))
7111                 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7112                                            1);
7113
7114         return em;
7115 }
7116
7117 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7118 {
7119         struct btrfs_block_group *block_group;
7120         bool readonly = false;
7121
7122         block_group = btrfs_lookup_block_group(fs_info, bytenr);
7123         if (!block_group || block_group->ro)
7124                 readonly = true;
7125         if (block_group)
7126                 btrfs_put_block_group(block_group);
7127         return readonly;
7128 }
7129
7130 /*
7131  * Check if we can do nocow write into the range [@offset, @offset + @len)
7132  *
7133  * @offset:     File offset
7134  * @len:        The length to write, will be updated to the nocow writeable
7135  *              range
7136  * @orig_start: (optional) Return the original file offset of the file extent
7137  * @orig_len:   (optional) Return the original on-disk length of the file extent
7138  * @ram_bytes:  (optional) Return the ram_bytes of the file extent
7139  * @strict:     if true, omit optimizations that might force us into unnecessary
7140  *              cow. e.g., don't trust generation number.
7141  *
7142  * Return:
7143  * >0   and update @len if we can do nocow write
7144  *  0   if we can't do nocow write
7145  * <0   if error happened
7146  *
7147  * NOTE: This only checks the file extents, caller is responsible to wait for
7148  *       any ordered extents.
7149  */
7150 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7151                               u64 *orig_start, u64 *orig_block_len,
7152                               u64 *ram_bytes, bool strict)
7153 {
7154         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7155         struct btrfs_path *path;
7156         int ret;
7157         struct extent_buffer *leaf;
7158         struct btrfs_root *root = BTRFS_I(inode)->root;
7159         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7160         struct btrfs_file_extent_item *fi;
7161         struct btrfs_key key;
7162         u64 disk_bytenr;
7163         u64 backref_offset;
7164         u64 extent_end;
7165         u64 num_bytes;
7166         int slot;
7167         int found_type;
7168         bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7169
7170         path = btrfs_alloc_path();
7171         if (!path)
7172                 return -ENOMEM;
7173
7174         ret = btrfs_lookup_file_extent(NULL, root, path,
7175                         btrfs_ino(BTRFS_I(inode)), offset, 0);
7176         if (ret < 0)
7177                 goto out;
7178
7179         slot = path->slots[0];
7180         if (ret == 1) {
7181                 if (slot == 0) {
7182                         /* can't find the item, must cow */
7183                         ret = 0;
7184                         goto out;
7185                 }
7186                 slot--;
7187         }
7188         ret = 0;
7189         leaf = path->nodes[0];
7190         btrfs_item_key_to_cpu(leaf, &key, slot);
7191         if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7192             key.type != BTRFS_EXTENT_DATA_KEY) {
7193                 /* not our file or wrong item type, must cow */
7194                 goto out;
7195         }
7196
7197         if (key.offset > offset) {
7198                 /* Wrong offset, must cow */
7199                 goto out;
7200         }
7201
7202         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7203         found_type = btrfs_file_extent_type(leaf, fi);
7204         if (found_type != BTRFS_FILE_EXTENT_REG &&
7205             found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7206                 /* not a regular extent, must cow */
7207                 goto out;
7208         }
7209
7210         if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7211                 goto out;
7212
7213         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7214         if (extent_end <= offset)
7215                 goto out;
7216
7217         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7218         if (disk_bytenr == 0)
7219                 goto out;
7220
7221         if (btrfs_file_extent_compression(leaf, fi) ||
7222             btrfs_file_extent_encryption(leaf, fi) ||
7223             btrfs_file_extent_other_encoding(leaf, fi))
7224                 goto out;
7225
7226         /*
7227          * Do the same check as in btrfs_cross_ref_exist but without the
7228          * unnecessary search.
7229          */
7230         if (!strict &&
7231             (btrfs_file_extent_generation(leaf, fi) <=
7232              btrfs_root_last_snapshot(&root->root_item)))
7233                 goto out;
7234
7235         backref_offset = btrfs_file_extent_offset(leaf, fi);
7236
7237         if (orig_start) {
7238                 *orig_start = key.offset - backref_offset;
7239                 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7240                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7241         }
7242
7243         if (btrfs_extent_readonly(fs_info, disk_bytenr))
7244                 goto out;
7245
7246         num_bytes = min(offset + *len, extent_end) - offset;
7247         if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7248                 u64 range_end;
7249
7250                 range_end = round_up(offset + num_bytes,
7251                                      root->fs_info->sectorsize) - 1;
7252                 ret = test_range_bit(io_tree, offset, range_end,
7253                                      EXTENT_DELALLOC, 0, NULL);
7254                 if (ret) {
7255                         ret = -EAGAIN;
7256                         goto out;
7257                 }
7258         }
7259
7260         btrfs_release_path(path);
7261
7262         /*
7263          * look for other files referencing this extent, if we
7264          * find any we must cow
7265          */
7266
7267         ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7268                                     key.offset - backref_offset, disk_bytenr,
7269                                     strict);
7270         if (ret) {
7271                 ret = 0;
7272                 goto out;
7273         }
7274
7275         /*
7276          * adjust disk_bytenr and num_bytes to cover just the bytes
7277          * in this extent we are about to write.  If there
7278          * are any csums in that range we have to cow in order
7279          * to keep the csums correct
7280          */
7281         disk_bytenr += backref_offset;
7282         disk_bytenr += offset - key.offset;
7283         if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7284                 goto out;
7285         /*
7286          * all of the above have passed, it is safe to overwrite this extent
7287          * without cow
7288          */
7289         *len = num_bytes;
7290         ret = 1;
7291 out:
7292         btrfs_free_path(path);
7293         return ret;
7294 }
7295
/*
 * Lock the range [lockstart, lockend] of the inode's io_tree for a direct IO
 * request, retrying until the range has no ordered extent and, when @writing
 * is true, no page cache pages.
 *
 * @inode:        inode the direct IO targets
 * @lockstart:    first byte of the range to lock
 * @lockend:      last byte (inclusive) of the range to lock
 * @cached_state: extent state cache handed to the lock/unlock helpers
 * @writing:      true for a DIO write, false for a DIO read
 *
 * Returns 0 with the range still locked, or -ENOTBLK with the range already
 * unlocked when the caller has to give up (the two cases are described in the
 * comments inside the loop).
 */
7296 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7297                               struct extent_state **cached_state, bool writing)
7298 {
7299         struct btrfs_ordered_extent *ordered;
7300         int ret = 0;
7301
7302         while (1) {
7303                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7304                                  cached_state);
7305                 /*
7306                  * We're concerned with the entire range that we're going to be
7307                  * doing DIO to, so we need to make sure there's no ordered
7308                  * extents in this range.
7309                  */
7310                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7311                                                      lockend - lockstart + 1);
7312
7313                 /*
7314                  * We need to make sure there are no buffered pages in this
7315                  * range either, we could have raced between the invalidate in
7316                  * generic_file_direct_write and locking the extent.  The
7317                  * invalidate needs to happen so that reads after a write do not
7318                  * get stale data.
7319                  */
7320                 if (!ordered &&
7321                     (!writing || !filemap_range_has_page(inode->i_mapping,
7322                                                          lockstart, lockend)))
7323                         break;
7324
                     /*
                      * Something is in the way: drop the lock before deciding
                      * whether to retry or to bail out.
                      */
7325                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7326                                      cached_state);
7327
7328                 if (ordered) {
7329                         /*
7330                          * If we are doing a DIO read and the ordered extent we
7331                          * found is for a buffered write, we can not wait for it
7332                          * to complete and retry, because if we do so we can
7333                          * deadlock with concurrent buffered writes on page
7334                          * locks. This happens only if our DIO read covers more
7335                          * than one extent map, if at this point has already
7336                          * created an ordered extent for a previous extent map
7337                          * and locked its range in the inode's io tree, and a
7338                          * concurrent write against that previous extent map's
7339                          * range and this range started (we unlock the ranges
7340                          * in the io tree only when the bios complete and
7341                          * buffered writes always lock pages before attempting
7342                          * to lock range in the io tree).
7343                          */
7344                         if (writing ||
7345                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7346                                 btrfs_start_ordered_extent(ordered, 1);
7347                         else
7348                                 ret = -ENOTBLK;
                             /* Drop the reference taken by the lookup above. */
7349                         btrfs_put_ordered_extent(ordered);
7350                 } else {
7351                         /*
7352                          * We could trigger writeback for this range (and wait
7353                          * for it to complete) and then invalidate the pages for
7354                          * this range (through invalidate_inode_pages2_range()),
7355                          * but that can lead us to a deadlock with a concurrent
7356                          * call to readahead (a buffered read or a defrag call
7357                          * triggered a readahead) on a page lock due to an
7358                          * ordered dio extent we created before but did not have
7359                          * yet a corresponding bio submitted (whence it can not
7360                          * complete), which makes readahead wait for that
7361                          * ordered extent to complete while holding a lock on
7362                          * that page.
7363                          */
7364                         ret = -ENOTBLK;
7365                 }
7366
                     /* Bail out; the range was already unlocked above. */
7367                 if (ret)
7368                         break;
7369
                     /* Otherwise loop around and try to take the lock again. */
7370                 cond_resched();
7371         }
7372
7373         return ret;
7374 }
7375
7376 /*
      * Allocate an extent map describing an IO in progress and insert it into
      * the inode's extent map tree.
      *
      * The callers of this must take lock_extent().
      *
      * @type must be one of BTRFS_ORDERED_PREALLOC, BTRFS_ORDERED_COMPRESSED,
      * BTRFS_ORDERED_NOCOW or BTRFS_ORDERED_REGULAR (asserted below).  The new
      * map is always flagged EXTENT_FLAG_PINNED; PREALLOC additionally sets
      * EXTENT_FLAG_FILLING, COMPRESSED sets EXTENT_FLAG_COMPRESSED and records
      * @compress_type.  The remaining arguments are copied verbatim into the
      * extent map fields of the same name.
      *
      * Returns the new extent map, ERR_PTR(-ENOMEM) if it could not be
      * allocated, or ERR_PTR() of the add_extent_mapping() error.
      */
7377 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7378                                        u64 len, u64 orig_start, u64 block_start,
7379                                        u64 block_len, u64 orig_block_len,
7380                                        u64 ram_bytes, int compress_type,
7381                                        int type)
7382 {
7383         struct extent_map_tree *em_tree;
7384         struct extent_map *em;
7385         int ret;
7386
7387         ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7388                type == BTRFS_ORDERED_COMPRESSED ||
7389                type == BTRFS_ORDERED_NOCOW ||
7390                type == BTRFS_ORDERED_REGULAR);
7391
7392         em_tree = &inode->extent_tree;
7393         em = alloc_extent_map();
7394         if (!em)
7395                 return ERR_PTR(-ENOMEM);
7396
7397         em->start = start;
7398         em->orig_start = orig_start;
7399         em->len = len;
7400         em->block_len = block_len;
7401         em->block_start = block_start;
7402         em->orig_block_len = orig_block_len;
7403         em->ram_bytes = ram_bytes;
7404         em->generation = -1;
7405         set_bit(EXTENT_FLAG_PINNED, &em->flags);
7406         if (type == BTRFS_ORDERED_PREALLOC) {
7407                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
7408         } else if (type == BTRFS_ORDERED_COMPRESSED) {
7409                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7410                 em->compress_type = compress_type;
7411         }
7412
             /*
              * Drop whatever is cached for the range, then insert the new map;
              * repeat for as long as the insertion reports -EEXIST.
              */
7413         do {
7414                 btrfs_drop_extent_cache(inode, em->start,
7415                                         em->start + em->len - 1, 0);
7416                 write_lock(&em_tree->lock);
7417                 ret = add_extent_mapping(em_tree, em, 1);
7418                 write_unlock(&em_tree->lock);
7419                 /*
7420                  * The caller has taken lock_extent(), who could race with us
7421                  * to add em?
7422                  */
7423         } while (ret == -EEXIST);
7424
7425         if (ret) {
7426                 free_extent_map(em);
7427                 return ERR_PTR(ret);
7428         }
7429
7430         /* em got 2 refs now, callers needs to do free_extent_map once. */
7431         return em;
7432 }
7433
7434
/*
 * Prepare the extent that a direct IO write of @len bytes at @start will
 * land in, either by reusing the existing extent (NOCOW / PREALLOC) or by
 * allocating a new one (COW).
 *
 * @map:      in: the extent map found for @start, with one reference owned by
 *            the caller.  out on success: the extent map describing the write
 *            target (the input map for a plain NOCOW write, otherwise a new
 *            one).  On every error return the input reference has been
 *            dropped and *@map is NULL.
 * @inode:    inode being written
 * @dio_data: per-request DIO state; ->data_reserved tracks the data space
 *            reserved on the COW path
 * @start:    file offset of the write
 * @len:      number of bytes to write
 *
 * On success i_size is raised to @start + (possibly trimmed) @len if the
 * write extends the file.
 *
 * Returns 0 on success or a negative errno; on error any space reserved here
 * has been released again.
 */
7435 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7436                                          struct inode *inode,
7437                                          struct btrfs_dio_data *dio_data,
7438                                          u64 start, u64 len)
7439 {
7440         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7441         struct extent_map *em = *map;
7442         int type;
7443         u64 block_start, orig_start, orig_block_len, ram_bytes;
7444         bool can_nocow = false;
7445         bool space_reserved = false;
7446         u64 prev_len;
7447         int ret = 0;
7448
7449         /*
7450          * We don't allocate a new extent in the following cases
7451          *
7452          * 1) The inode is marked as NODATACOW. In this case we'll just use the
7453          * existing extent.
7454          * 2) The extent is marked as PREALLOC. We're good to go here and can
7455          * just use the extent.
7456          *
7457          */
7458         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7459             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7460              em->block_start != EXTENT_MAP_HOLE)) {
7461                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7462                         type = BTRFS_ORDERED_PREALLOC;
7463                 else
7464                         type = BTRFS_ORDERED_NOCOW;
7465                 len = min(len, em->len - (start - em->start));
7466                 block_start = em->block_start + (start - em->start);
7467
7468                 if (can_nocow_extent(inode, start, &len, &orig_start,
7469                                      &orig_block_len, &ram_bytes, false) == 1 &&
7470                     btrfs_inc_nocow_writers(fs_info, block_start))
7471                         can_nocow = true;
7472         }
7473
             /* Remember the length the reservations below are made for. */
7474         prev_len = len;
7475         if (can_nocow) {
7476                 struct extent_map *em2;
7477
7478                 /* We can NOCOW, so only need to reserve metadata space. */
7479                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
7480                 if (ret < 0) {
7481                         /* Our caller expects us to free the input extent map. */
7482                         free_extent_map(em);
7483                         *map = NULL;
7484                         btrfs_dec_nocow_writers(fs_info, block_start);
7485                         goto out;
7486                 }
7487                 space_reserved = true;
7488
7489                 em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
7490                                               orig_start, block_start,
7491                                               len, orig_block_len,
7492                                               ram_bytes, type);
7493                 btrfs_dec_nocow_writers(fs_info, block_start);
7494                 if (type == BTRFS_ORDERED_PREALLOC) {
7495                         free_extent_map(em);
7496                         *map = em = em2;
7497                 }
7498
7499                 if (IS_ERR(em2)) {
7500                         ret = PTR_ERR(em2);
                             /*
                              * Our caller expects us to free the input extent
                              * map on error.  For PREALLOC it was already
                              * dropped above; for NOCOW it is still held, so
                              * drop it here or it would be leaked.  Either way
                              * do not hand an ERR_PTR or stale map back.
                              */
                             if (type != BTRFS_ORDERED_PREALLOC)
                                     free_extent_map(em);
                             *map = NULL;
7501                         goto out;
7502                 }
7503         } else {
7504                 /* Our caller expects us to free the input extent map. */
7505                 free_extent_map(em);
7506                 *map = NULL;
7507
7508                 /* We have to COW, so need to reserve metadata and data space. */
7509                 ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
7510                                                    &dio_data->data_reserved,
7511                                                    start, len);
7512                 if (ret < 0)
7513                         goto out;
7514                 space_reserved = true;
7515
7516                 em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
7517                 if (IS_ERR(em)) {
7518                         ret = PTR_ERR(em);
7519                         goto out;
7520                 }
7521                 *map = em;
                     /* The new extent may be shorter; give back the unused tail. */
7522                 len = min(len, em->len - (start - em->start));
7523                 if (len < prev_len)
7524                         btrfs_delalloc_release_space(BTRFS_I(inode),
7525                                                      dio_data->data_reserved,
7526                                                      start + len, prev_len - len,
7527                                                      true);
7528         }
7529
7530         /*
7531          * We have created our ordered extent, so we can now release our reservation
7532          * for an outstanding extent.
7533          */
7534         btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7535
7536         /*
7537          * Need to update the i_size under the extent lock so buffered
7538          * readers will get the updated i_size when we unlock.
7539          */
7540         if (start + len > i_size_read(inode))
7541                 i_size_write(inode, start + len);
7542 out:
             /* Undo whatever was reserved before the failure. */
7543         if (ret && space_reserved) {
7544                 btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7545                 if (can_nocow) {
7546                         btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7547                 } else {
7548                         btrfs_delalloc_release_space(BTRFS_I(inode),
7549                                                      dio_data->data_reserved,
7550                                                      start, len, true);
7551                         extent_changeset_free(dio_data->data_reserved);
7552                         dio_data->data_reserved = NULL;
7553                 }
7554         }
7555         return ret;
7556 }
7557
/*
 * Lock and map (for writes: also allocate/reserve) the extent backing a
 * direct IO request and describe it in @iomap.
 *
 * @inode:  inode the IO targets
 * @start:  file offset of the request
 * @length: number of bytes requested
 * @flags:  IOMAP_* flags; IOMAP_WRITE and IOMAP_NOWAIT are examined here
 * @iomap:  filled in on success; ->private receives a newly allocated
 *          struct btrfs_dio_data
 * @srcmap: not used by this function
 *
 * Reads are mapped at most one sector at a time.  For writes the whole
 * locked range is unlocked before returning; for reads only the part past
 * the mapped length is unlocked here.
 *
 * Returns 0 on success.  Returns -ENOTBLK when the range could not be locked
 * or the extent is compressed or inline, -EAGAIN for a NOWAIT request that is
 * not covered by a single extent map, -ENOMEM if the DIO data could not be
 * allocated, or the error of one of the helpers called below.
 *
 * NOTE(review): on the error paths dio_data is freed while iomap->private
 * still points at it; whether anything looks at it afterwards cannot be told
 * from this function alone.
 */
7558 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7559                 loff_t length, unsigned int flags, struct iomap *iomap,
7560                 struct iomap *srcmap)
7561 {
7562         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7563         struct extent_map *em;
7564         struct extent_state *cached_state = NULL;
7565         struct btrfs_dio_data *dio_data = NULL;
7566         u64 lockstart, lockend;
7567         const bool write = !!(flags & IOMAP_WRITE);
7568         int ret = 0;
7569         u64 len = length;
7570         bool unlock_extents = false;
7571
             /* Reads are handled one sector per call. */
7572         if (!write)
7573                 len = min_t(u64, len, fs_info->sectorsize);
7574
7575         lockstart = start;
7576         lockend = start + len - 1;
7577
7578         /*
7579          * The generic stuff only does filemap_write_and_wait_range, which
7580          * isn't enough if we've written compressed pages to this area, so we
7581          * need to flush the dirty pages again to make absolutely sure that any
7582          * outstanding dirty pages are on disk.
7583          */
7584         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7585                      &BTRFS_I(inode)->runtime_flags)) {
7586                 ret = filemap_fdatawrite_range(inode->i_mapping, start,
7587                                                start + length - 1);
7588                 if (ret)
7589                         return ret;
7590         }
7591
7592         dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
7593         if (!dio_data)
7594                 return -ENOMEM;
7595
7596         iomap->private = dio_data;
7597
7598
7599         /*
7600          * If this errors out it's because we couldn't invalidate pagecache for
7601          * this range and we need to fallback to buffered.
7602          */
7603         if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
7604                 ret = -ENOTBLK;
7605                 goto err;
7606         }
7607
7608         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7609         if (IS_ERR(em)) {
7610                 ret = PTR_ERR(em);
7611                 goto unlock_err;
7612         }
7613
7614         /*
7615          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7616          * io.  INLINE is special, and we could probably kludge it in here, but
7617          * it's still buffered so for safety lets just fall back to the generic
7618          * buffered path.
7619          *
7620          * For COMPRESSED we _have_ to read the entire extent in so we can
7621          * decompress it, so there will be buffering required no matter what we
7622          * do, so go ahead and fallback to buffered.
7623          *
7624          * We return -ENOTBLK because that's what makes DIO go ahead and go back
7625          * to buffered IO.  Don't blame me, this is the price we pay for using
7626          * the generic code.
7627          */
7628         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7629             em->block_start == EXTENT_MAP_INLINE) {
7630                 free_extent_map(em);
7631                 ret = -ENOTBLK;
7632                 goto unlock_err;
7633         }
7634
             /* Do not map past the end of the extent map we found. */
7635         len = min(len, em->len - (start - em->start));
7636
7637         /*
7638          * If we have a NOWAIT request and the range contains multiple extents
7639          * (or a mix of extents and holes), then we return -EAGAIN to make the
7640          * caller fallback to a context where it can do a blocking (without
7641          * NOWAIT) request. This way we avoid doing partial IO and returning
7642          * success to the caller, which is not optimal for writes and for reads
7643          * it can result in unexpected behaviour for an application.
7644          *
7645          * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7646          * iomap_dio_rw(), we can end up returning less data then what the caller
7647          * asked for, resulting in an unexpected, and incorrect, short read.
7648          * That is, the caller asked to read N bytes and we return less than that,
7649          * which is wrong unless we are crossing EOF. This happens if we get a
7650          * page fault error when trying to fault in pages for the buffer that is
7651          * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7652          * have previously submitted bios for other extents in the range, in
7653          * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7654          * those bios have completed by the time we get the page fault error,
7655          * which we return back to our caller - we should only return EIOCBQUEUED
7656          * after we have submitted bios for all the extents in the range.
7657          */
7658         if ((flags & IOMAP_NOWAIT) && len < length) {
7659                 free_extent_map(em);
7660                 ret = -EAGAIN;
7661                 goto unlock_err;
7662         }
7663
7664         if (write) {
                     /*
                      * The helper consumes the reference on em and may replace
                      * it through &em; on failure em is not touched again here.
                      */
7665                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7666                                                     start, len);
7667                 if (ret < 0)
7668                         goto unlock_err;
7669                 unlock_extents = true;
7670                 /* Recalc len in case the new em is smaller than requested */
7671                 len = min(len, em->len - (start - em->start));
7672         } else {
7673                 /*
7674                  * We need to unlock only the end area that we aren't using.
7675                  * The rest is going to be unlocked by the endio routine.
7676                  */
7677                 lockstart = start + len;
7678                 if (lockstart < lockend)
7679                         unlock_extents = true;
7680         }
7681
7682         if (unlock_extents)
7683                 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7684                                      lockstart, lockend, &cached_state);
7685         else
7686                 free_extent_state(cached_state);
7687
7688         /*
7689          * Translate extent map information to iomap.
7690          * We trim the extents (and move the addr) even though iomap code does
7691          * that, since we have locked only the parts we are performing I/O in.
7692          */
7693         if ((em->block_start == EXTENT_MAP_HOLE) ||
7694             (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7695                 iomap->addr = IOMAP_NULL_ADDR;
7696                 iomap->type = IOMAP_HOLE;
7697         } else {
7698                 iomap->addr = em->block_start + (start - em->start);
7699                 iomap->type = IOMAP_MAPPED;
7700         }
7701         iomap->offset = start;
7702         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7703         iomap->length = len;
7704
7705         if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
7706                 iomap->flags |= IOMAP_F_ZONE_APPEND;
7707
7708         free_extent_map(em);
7709
7710         return 0;
7711
7712 unlock_err:
7713         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7714                              &cached_state);
7715 err:
7716         kfree(dio_data);
7717
7718         return ret;
7719 }
7720
/*
 * Finish one mapped range of a direct IO request: clean up whatever part of
 * [@pos, @pos + @length) had no bio submitted for it and free the
 * struct btrfs_dio_data stored in @iomap->private.
 *
 * @inode:   inode the IO targeted
 * @pos:     file offset of the mapped range
 * @length:  length of the mapped range
 * @written: not used by this function
 * @flags:   IOMAP_* flags; only IOMAP_WRITE is examined
 * @iomap:   the mapping; ->private is freed and reset to NULL
 *
 * Returns 0, or -ENOTBLK when fewer than @length bytes were submitted
 * (according to dio_data->submitted) for a range that is not a read hole.
 */
7721 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7722                 ssize_t written, unsigned int flags, struct iomap *iomap)
7723 {
7724         int ret = 0;
7725         struct btrfs_dio_data *dio_data = iomap->private;
7726         size_t submitted = dio_data->submitted;
7727         const bool write = !!(flags & IOMAP_WRITE);
7728
7729         if (!write && (iomap->type == IOMAP_HOLE)) {
7730                 /* If reading from a hole, unlock and return */
7731                 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
7732                 goto out;
7733         }
7734
7735         if (submitted < length) {
                     /* Only the tail past what was submitted needs cleaning up. */
7736                 pos += submitted;
7737                 length -= submitted;
7738                 if (write)
7739                         __endio_write_update_ordered(BTRFS_I(inode), pos,
7740                                         length, false);
7741                 else
7742                         unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7743                                       pos + length - 1);
7744                 ret = -ENOTBLK;
7745         }
7746
7747         if (write)
7748                 extent_changeset_free(dio_data->data_reserved);
7749 out:
7750         kfree(dio_data);
7751         iomap->private = NULL;
7752
7753         return ret;
7754 }
7755
/*
 * Drop one reference on @dip.  When the last reference goes away: for a
 * write, mark the ordered IO for the whole DIO range finished (uptodate only
 * if dio_bio->bi_status is clear); for a read, unlock the range in the
 * inode's io_tree.  Then complete the original dio_bio and free @dip.
 */
7756 static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
7757 {
7758         /*
7759          * This implies a barrier so that stores to dio_bio->bi_status before
7760          * this and loads of dio_bio->bi_status after this are fully ordered.
7761          */
7762         if (!refcount_dec_and_test(&dip->refs))
7763                 return;
7764
7765         if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
7766                 __endio_write_update_ordered(BTRFS_I(dip->inode),
7767                                              dip->file_offset,
7768                                              dip->bytes,
7769                                              !dip->dio_bio->bi_status);
7770         } else {
7771                 unlock_extent(&BTRFS_I(dip->inode)->io_tree,
7772                               dip->file_offset,
7773                               dip->file_offset + dip->bytes - 1);
7774         }
7775
7776         bio_endio(dip->dio_bio);
7777         kfree(dip);
7778 }
7779
/*
 * Submit a repair bio for a failed direct IO read sector to mirror
 * @mirror_num.  It is passed to btrfs_repair_one_sector() as the submit hook
 * by btrfs_check_read_dio_bio().
 *
 * @inode:      inode the read belongs to
 * @bio:        repair bio; ->bi_private must point at the btrfs_dio_private
 * @mirror_num: mirror to read from
 * @bio_flags:  not used by this function
 *
 * A reference on the dio private is taken for the in-flight bio and dropped
 * again if btrfs_map_bio() fails.  Must never be called for a write bio.
 *
 * Returns BLK_STS_OK or the failing helper's status.
 */
7780 static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7781                                           int mirror_num,
7782                                           unsigned long bio_flags)
7783 {
7784         struct btrfs_dio_private *dip = bio->bi_private;
7785         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7786         blk_status_t ret;
7787
7788         BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7789
7790         ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
7791         if (ret)
7792                 return ret;
7793
7794         refcount_inc(&dip->refs);
7795         ret = btrfs_map_bio(fs_info, bio, mirror_num);
7796         if (ret)
7797                 refcount_dec(&dip->refs);
7798         return ret;
7799 }
7800
/*
 * Walk a completed direct IO read bio one sector at a time, verifying each
 * sector and starting a repair for the ones that are bad.
 *
 * @dip:      dio private the bio belongs to; supplies the inode and the file
 *            offset the walk starts from
 * @bbio:     the completed btrfs bio; its saved iterator (bbio->iter) is
 *            walked
 * @uptodate: false if the bio itself completed with an error
 *
 * A sector is considered good when the bio was @uptodate and either the
 * inode has BTRFS_INODE_NODATASUM set or check_data_csum() returns 0; for
 * those clean_io_failure() is called.  Every other sector is handed to
 * btrfs_repair_one_sector() with submit_dio_repair_bio() as submit hook.
 *
 * Returns BLK_STS_OK, or the status converted from the most recent
 * btrfs_repair_one_sector() failure.
 */
7801 static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
7802                                              struct btrfs_bio *bbio,
7803                                              const bool uptodate)
7804 {
7805         struct inode *inode = dip->inode;
7806         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
7807         const u32 sectorsize = fs_info->sectorsize;
7808         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7809         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7810         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
7811         struct bio_vec bvec;
7812         struct bvec_iter iter;
7813         const u64 orig_file_offset = dip->file_offset;
7814         u64 start = orig_file_offset;
7815         u32 bio_offset = 0;
7816         blk_status_t err = BLK_STS_OK;
7817
7818         __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
7819                 unsigned int i, nr_sectors, pgoff;
7820
7821                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
7822                 pgoff = bvec.bv_offset;
7823                 for (i = 0; i < nr_sectors; i++) {
7824                         ASSERT(pgoff < PAGE_SIZE);
7825                         if (uptodate &&
7826                             (!csum || !check_data_csum(inode, bbio,
7827                                                        bio_offset, bvec.bv_page,
7828                                                        pgoff, start))) {
7829                                 clean_io_failure(fs_info, failure_tree, io_tree,
7830                                                  start, bvec.bv_page,
7831                                                  btrfs_ino(BTRFS_I(inode)),
7832                                                  pgoff);
7833                         } else {
7834                                 int ret;
7835
7836                                 ASSERT((start - orig_file_offset) < UINT_MAX);
7837                                 ret = btrfs_repair_one_sector(inode,
7838                                                 &bbio->bio,
7839                                                 start - orig_file_offset,
7840                                                 bvec.bv_page, pgoff,
7841                                                 start, bbio->mirror_num,
7842                                                 submit_dio_repair_bio);
7843                                 if (ret)
7844                                         err = errno_to_blk_status(ret);
7845                         }
                             /* Advance file offset, bio offset and page offset by one sector. */
7846                         start += sectorsize;
                             /* Catch u32 wrap-around of bio_offset. */
7847                         ASSERT(bio_offset + sectorsize > bio_offset);
7848                         bio_offset += sectorsize;
7849                         pgoff += sectorsize;
7850                 }
7851         }
7852         return err;
7853 }
7854
/*
 * Mark the ordered IO covering [@offset, @offset + @bytes) of @inode as
 * finished, passing @uptodate on to report whether the write succeeded.
 * Thin wrapper around btrfs_mark_ordered_io_finished() that passes no page
 * and uses finish_ordered_fn as the completion work function.
 */
7855 static void __endio_write_update_ordered(struct btrfs_inode *inode,
7856                                          const u64 offset, const u64 bytes,
7857                                          const bool uptodate)
7858 {
7859         btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
7860                                        finish_ordered_fn, uptodate);
7861 }
7862
/*
 * Checksum callback handed to btrfs_wq_submit_bio() for asynchronously
 * submitted direct IO writes (see btrfs_submit_dio_bio()): computes the
 * checksums of @bio for file offset @dio_file_offset via
 * btrfs_csum_one_bio() and returns its status.
 */
7863 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
7864                                                      struct bio *bio,
7865                                                      u64 dio_file_offset)
7866 {
7867         return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
7868 }
7869
/*
 * Completion handler for the cloned bios of a direct IO request
 * (bio->bi_private is the btrfs_dio_private).
 *
 * Logs a warning if the bio failed, runs the per-sector verification/repair
 * pass for reads, propagates any resulting error to the original dio_bio,
 * then releases the clone and drops its reference on the dio private.
 */
7870 static void btrfs_end_dio_bio(struct bio *bio)
7871 {
7872         struct btrfs_dio_private *dip = bio->bi_private;
7873         blk_status_t err = bio->bi_status;
7874
7875         if (err)
7876                 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7877                            "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
7878                            btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
7879                            bio->bi_opf, bio->bi_iter.bi_sector,
7880                            bio->bi_iter.bi_size, err);
7881
             /* For reads, err is replaced by the outcome of the check/repair pass. */
7882         if (bio_op(bio) == REQ_OP_READ)
7883                 err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
7884
7885         if (err)
7886                 dip->dio_bio->bi_status = err;
7887
7888         btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
7889
7890         bio_put(bio);
7891         btrfs_dio_private_put(dip);
7892 }
7893
/*
 * Submit one cloned bio of a direct IO request.
 *
 * @bio:          the clone; ->bi_private is the btrfs_dio_private
 * @inode:        inode the IO targets
 * @file_offset:  file offset the clone starts at
 * @async_submit: non-zero to request asynchronous submission; ignored when
 *                the inode currently has sync writers
 *
 * Reads get their end_io routed through btrfs_bio_wq_end_io() and, unless
 * the inode is NODATASUM, are pointed at the slice of dip->csums that
 * matches @file_offset.  Writes on checksummed inodes are either handed to
 * btrfs_wq_submit_bio() (async) or checksummed inline before being mapped.
 *
 * Returns BLK_STS_OK or the status of the helper that failed.
 */
7894 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
7895                 struct inode *inode, u64 file_offset, int async_submit)
7896 {
7897         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7898         struct btrfs_dio_private *dip = bio->bi_private;
7899         bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
7900         blk_status_t ret;
7901
7902         /* Check btrfs_submit_bio_hook() for rules about async submit. */
7903         if (async_submit)
7904                 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7905
7906         if (!write) {
7907                 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
7908                 if (ret)
7909                         goto err;
7910         }
7911
             /* No data checksums for this inode: nothing to compute or look up. */
7912         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
7913                 goto map;
7914
7915         if (write && async_submit) {
7916                 ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
7917                                           btrfs_submit_bio_start_direct_io);
                     /*
                      * Not only an error exit: the bio has been handed to the
                      * async path, so btrfs_map_bio() below is skipped and the
                      * result is returned as is.
                      */
7918                 goto err;
7919         } else if (write) {
7920                 /*
7921                  * If we aren't doing async submit, calculate the csum of the
7922                  * bio now.
7923                  */
7924                 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
7925                 if (ret)
7926                         goto err;
7927         } else {
7928                 u64 csum_offset;
7929
                     /*
                      * Byte distance from the start of the DIO, converted to a
                      * sector index and then to a byte offset into dip->csums.
                      */
7930                 csum_offset = file_offset - dip->file_offset;
7931                 csum_offset >>= fs_info->sectorsize_bits;
7932                 csum_offset *= fs_info->csum_size;
7933                 btrfs_bio(bio)->csum = dip->csums + csum_offset;
7934         }
7935 map:
7936         ret = btrfs_map_bio(fs_info, bio, 0);
7937 err:
7938         return ret;
7939 }
7940
7941 /*
      * Allocate and initialize the btrfs_dio_private tracking one direct IO bio.
      *
      * @dio_bio:     the original bio of the request; its size and start sector
      *               are recorded in the new structure
      * @inode:       inode the IO targets
      * @file_offset: file offset the bio starts at
      *
      * For reads on inodes with data checksums enabled, room for one checksum
      * per sector of @dio_bio is allocated right behind the structure.
      *
7942  * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
7943  * or ordered extents whether or not we submit any bios.
      *
      * Returns the new structure holding a single reference, or NULL if the
      * allocation failed.
7944  */
7945 static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
7946                                                           struct inode *inode,
7947                                                           loff_t file_offset)
7948 {
7949         const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
7950         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
7951         size_t dip_size;
7952         struct btrfs_dio_private *dip;
7953
7954         dip_size = sizeof(*dip);
7955         if (!write && csum) {
7956                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7957                 size_t nblocks;
7958
                     /* One checksum slot per sector covered by the bio. */
7959                 nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
7960                 dip_size += fs_info->csum_size * nblocks;
7961         }
7962
7963         dip = kzalloc(dip_size, GFP_NOFS);
7964         if (!dip)
7965                 return NULL;
7966
7967         dip->inode = inode;
7968         dip->file_offset = file_offset;
7969         dip->bytes = dio_bio->bi_iter.bi_size;
             /* bi_sector is shifted by 9 to turn the sector number into bytes. */
7970         dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
7971         dip->dio_bio = dio_bio;
7972         refcount_set(&dip->refs, 1);
7973         return dip;
7974 }
7975
7976 static void btrfs_submit_direct(const struct iomap_iter *iter,
7977                 struct bio *dio_bio, loff_t file_offset)
7978 {
7979         struct inode *inode = iter->inode;
7980         const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
7981         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7982         const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
7983                              BTRFS_BLOCK_GROUP_RAID56_MASK);
7984         struct btrfs_dio_private *dip;
7985         struct bio *bio;
7986         u64 start_sector;
7987         int async_submit = 0;
7988         u64 submit_len;
7989         u64 clone_offset = 0;
7990         u64 clone_len;
7991         u64 logical;
7992         int ret;
7993         blk_status_t status;
7994         struct btrfs_io_geometry geom;
7995         struct btrfs_dio_data *dio_data = iter->iomap.private;
7996         struct extent_map *em = NULL;
7997
7998         dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
7999         if (!dip) {
8000                 if (!write) {
8001                         unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8002                                 file_offset + dio_bio->bi_iter.bi_size - 1);
8003                 }
8004                 dio_bio->bi_status = BLK_STS_RESOURCE;
8005                 bio_endio(dio_bio);
8006                 return;
8007         }
8008
8009         if (!write) {
8010                 /*
8011                  * Load the csums up front to reduce csum tree searches and
8012                  * contention when submitting bios.
8013                  *
8014                  * If we have csums disabled this will do nothing.
8015                  */
8016                 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
8017                 if (status != BLK_STS_OK)
8018                         goto out_err;
8019         }
8020
8021         start_sector = dio_bio->bi_iter.bi_sector;
8022         submit_len = dio_bio->bi_iter.bi_size;
8023
8024         do {
8025                 logical = start_sector << 9;
8026                 em = btrfs_get_chunk_map(fs_info, logical, submit_len);
8027                 if (IS_ERR(em)) {
8028                         status = errno_to_blk_status(PTR_ERR(em));
8029                         em = NULL;
8030                         goto out_err_em;
8031                 }
8032                 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
8033                                             logical, &geom);
8034                 if (ret) {
8035                         status = errno_to_blk_status(ret);
8036                         goto out_err_em;
8037                 }
8038
8039                 clone_len = min(submit_len, geom.len);
8040                 ASSERT(clone_len <= UINT_MAX);
8041
8042                 /*
8043                  * This will never fail as it's passing GPF_NOFS and
8044                  * the allocation is backed by btrfs_bioset.
8045                  */
8046                 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
8047                 bio->bi_private = dip;
8048                 bio->bi_end_io = btrfs_end_dio_bio;
8049
8050                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
8051                         status = extract_ordered_extent(BTRFS_I(inode), bio,
8052                                                         file_offset);
8053                         if (status) {
8054                                 bio_put(bio);
8055                                 goto out_err;
8056                         }
8057                 }
8058
8059                 ASSERT(submit_len >= clone_len);
8060                 submit_len -= clone_len;
8061
8062                 /*
8063                  * Increase the count before we submit the bio so we know
8064                  * the end IO handler won't happen before we increase the
8065                  * count. Otherwise, the dip might get freed before we're
8066                  * done setting it up.
8067                  *
8068                  * We transfer the initial reference to the last bio, so we
8069                  * don't need to increment the reference count for the last one.
8070                  */
8071                 if (submit_len > 0) {
8072                         refcount_inc(&dip->refs);
8073                         /*
8074                          * If we are submitting more than one bio, submit them
8075                          * all asynchronously. The exception is RAID 5 or 6, as
8076                          * asynchronous checksums make it difficult to collect
8077                          * full stripe writes.
8078                          */
8079                         if (!raid56)
8080                                 async_submit = 1;
8081                 }
8082
8083                 status = btrfs_submit_dio_bio(bio, inode, file_offset,
8084                                                 async_submit);
8085                 if (status) {
8086                         bio_put(bio);
8087                         if (submit_len > 0)
8088                                 refcount_dec(&dip->refs);
8089                         goto out_err_em;
8090                 }
8091
8092                 dio_data->submitted += clone_len;
8093                 clone_offset += clone_len;
8094                 start_sector += clone_len >> 9;
8095                 file_offset += clone_len;
8096
8097                 free_extent_map(em);
8098         } while (submit_len > 0);
8099         return;
8100
8101 out_err_em:
8102         free_extent_map(em);
8103 out_err:
8104         dip->dio_bio->bi_status = status;
8105         btrfs_dio_private_put(dip);
8106 }
8107
8108 const struct iomap_ops btrfs_dio_iomap_ops = {
8109         .iomap_begin            = btrfs_dio_iomap_begin,
8110         .iomap_end              = btrfs_dio_iomap_end,
8111 };
8112
8113 const struct iomap_dio_ops btrfs_dio_ops = {
8114         .submit_io              = btrfs_submit_direct,
8115 };
8116
8117 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8118                         u64 start, u64 len)
8119 {
8120         int     ret;
8121
8122         ret = fiemap_prep(inode, fieinfo, start, &len, 0);
8123         if (ret)
8124                 return ret;
8125
8126         return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
8127 }
8128
8129 int btrfs_readpage(struct file *file, struct page *page)
8130 {
8131         struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8132         u64 start = page_offset(page);
8133         u64 end = start + PAGE_SIZE - 1;
8134         struct btrfs_bio_ctrl bio_ctrl = { 0 };
8135         int ret;
8136
8137         btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
8138
8139         ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
8140         if (bio_ctrl.bio) {
8141                 int ret2;
8142
8143                 ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
8144                 if (ret == 0)
8145                         ret = ret2;
8146         }
8147         return ret;
8148 }
8149
8150 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8151 {
8152         struct inode *inode = page->mapping->host;
8153         int ret;
8154
8155         if (current->flags & PF_MEMALLOC) {
8156                 redirty_page_for_writepage(wbc, page);
8157                 unlock_page(page);
8158                 return 0;
8159         }
8160
8161         /*
8162          * If we are under memory pressure we will call this directly from the
8163          * VM, we need to make sure we have the inode referenced for the ordered
8164          * extent.  If not just return like we didn't do anything.
8165          */
8166         if (!igrab(inode)) {
8167                 redirty_page_for_writepage(wbc, page);
8168                 return AOP_WRITEPAGE_ACTIVATE;
8169         }
8170         ret = extent_write_full_page(page, wbc);
8171         btrfs_add_delayed_iput(inode);
8172         return ret;
8173 }
8174
/* Write back pages of @mapping; simply forwards to extent_writepages(). */
static int btrfs_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
{
        int ret = extent_writepages(mapping, wbc);

        return ret;
}
8180
/* Readahead entry point; simply forwards @rac to extent_readahead(). */
static void btrfs_readahead(struct readahead_control *rac)
{
        extent_readahead(rac);
}
8185
8186 /*
8187  * For releasepage() and invalidate_folio() we have a race window where
8188  * folio_end_writeback() is called but the subpage spinlock is not yet released.
8189  * If we continue to release/invalidate the page, we could cause use-after-free
8190  * for subpage spinlock.  So this function is to spin and wait for subpage
8191  * spinlock.
8192  */
8193 static void wait_subpage_spinlock(struct page *page)
8194 {
8195         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
8196         struct btrfs_subpage *subpage;
8197
8198         if (fs_info->sectorsize == PAGE_SIZE)
8199                 return;
8200
8201         ASSERT(PagePrivate(page) && page->private);
8202         subpage = (struct btrfs_subpage *)page->private;
8203
8204         /*
8205          * This may look insane as we just acquire the spinlock and release it,
8206          * without doing anything.  But we just want to make sure no one is
8207          * still holding the subpage spinlock.
8208          * And since the page is not dirty nor writeback, and we have page
8209          * locked, the only possible way to hold a spinlock is from the endio
8210          * function to clear page writeback.
8211          *
8212          * Here we just acquire the spinlock so that all existing callers
8213          * should exit and we're safe to release/invalidate the page.
8214          */
8215         spin_lock_irq(&subpage->lock);
8216         spin_unlock_irq(&subpage->lock);
8217 }
8218
8219 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8220 {
8221         int ret = try_release_extent_mapping(page, gfp_flags);
8222
8223         if (ret == 1) {
8224                 wait_subpage_spinlock(page);
8225                 clear_page_extent_mapped(page);
8226         }
8227         return ret;
8228 }
8229
8230 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8231 {
8232         if (PageWriteback(page) || PageDirty(page))
8233                 return 0;
8234         return __btrfs_releasepage(page, gfp_flags);
8235 }
8236
8237 #ifdef CONFIG_MIGRATION
8238 static int btrfs_migratepage(struct address_space *mapping,
8239                              struct page *newpage, struct page *page,
8240                              enum migrate_mode mode)
8241 {
8242         int ret;
8243
8244         ret = migrate_page_move_mapping(mapping, newpage, page, 0);
8245         if (ret != MIGRATEPAGE_SUCCESS)
8246                 return ret;
8247
8248         if (page_has_private(page))
8249                 attach_page_private(newpage, detach_page_private(page));
8250
8251         if (PageOrdered(page)) {
8252                 ClearPageOrdered(page);
8253                 SetPageOrdered(newpage);
8254         }
8255
8256         if (mode != MIGRATE_SYNC_NO_COPY)
8257                 migrate_page_copy(newpage, page);
8258         else
8259                 migrate_page_states(newpage, page);
8260         return MIGRATEPAGE_SUCCESS;
8261 }
8262 #endif
8263
8264 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
8265                                  size_t length)
8266 {
8267         struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
8268         struct btrfs_fs_info *fs_info = inode->root->fs_info;
8269         struct extent_io_tree *tree = &inode->io_tree;
8270         struct extent_state *cached_state = NULL;
8271         u64 page_start = folio_pos(folio);
8272         u64 page_end = page_start + folio_size(folio) - 1;
8273         u64 cur;
8274         int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
8275
8276         /*
8277          * We have folio locked so no new ordered extent can be created on this
8278          * page, nor bio can be submitted for this folio.
8279          *
8280          * But already submitted bio can still be finished on this folio.
8281          * Furthermore, endio function won't skip folio which has Ordered
8282          * (Private2) already cleared, so it's possible for endio and
8283          * invalidate_folio to do the same ordered extent accounting twice
8284          * on one folio.
8285          *
8286          * So here we wait for any submitted bios to finish, so that we won't
8287          * do double ordered extent accounting on the same folio.
8288          */
8289         folio_wait_writeback(folio);
8290         wait_subpage_spinlock(&folio->page);
8291
8292         /*
8293          * For subpage case, we have call sites like
8294          * btrfs_punch_hole_lock_range() which passes range not aligned to
8295          * sectorsize.
8296          * If the range doesn't cover the full folio, we don't need to and
8297          * shouldn't clear page extent mapped, as folio->private can still
8298          * record subpage dirty bits for other part of the range.
8299          *
8300          * For cases that invalidate the full folio even the range doesn't
8301          * cover the full folio, like invalidating the last folio, we're
8302          * still safe to wait for ordered extent to finish.
8303          */
8304         if (!(offset == 0 && length == folio_size(folio))) {
8305                 btrfs_releasepage(&folio->page, GFP_NOFS);
8306                 return;
8307         }
8308
8309         if (!inode_evicting)
8310                 lock_extent_bits(tree, page_start, page_end, &cached_state);
8311
8312         cur = page_start;
8313         while (cur < page_end) {
8314                 struct btrfs_ordered_extent *ordered;
8315                 bool delete_states;
8316                 u64 range_end;
8317                 u32 range_len;
8318
8319                 ordered = btrfs_lookup_first_ordered_range(inode, cur,
8320                                                            page_end + 1 - cur);
8321                 if (!ordered) {
8322                         range_end = page_end;
8323                         /*
8324                          * No ordered extent covering this range, we are safe
8325                          * to delete all extent states in the range.
8326                          */
8327                         delete_states = true;
8328                         goto next;
8329                 }
8330                 if (ordered->file_offset > cur) {
8331                         /*
8332                          * There is a range between [cur, oe->file_offset) not
8333                          * covered by any ordered extent.
8334                          * We are safe to delete all extent states, and handle
8335                          * the ordered extent in the next iteration.
8336                          */
8337                         range_end = ordered->file_offset - 1;
8338                         delete_states = true;
8339                         goto next;
8340                 }
8341
8342                 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8343                                 page_end);
8344                 ASSERT(range_end + 1 - cur < U32_MAX);
8345                 range_len = range_end + 1 - cur;
8346                 if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8347                         /*
8348                          * If Ordered (Private2) is cleared, it means endio has
8349                          * already been executed for the range.
8350                          * We can't delete the extent states as
8351                          * btrfs_finish_ordered_io() may still use some of them.
8352                          */
8353                         delete_states = false;
8354                         goto next;
8355                 }
8356                 btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8357
8358                 /*
8359                  * IO on this page will never be started, so we need to account
8360                  * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8361                  * here, must leave that up for the ordered extent completion.
8362                  *
8363                  * This will also unlock the range for incoming
8364                  * btrfs_finish_ordered_io().
8365                  */
8366                 if (!inode_evicting)
8367                         clear_extent_bit(tree, cur, range_end,
8368                                          EXTENT_DELALLOC |
8369                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8370                                          EXTENT_DEFRAG, 1, 0, &cached_state);
8371
8372                 spin_lock_irq(&inode->ordered_tree.lock);
8373                 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8374                 ordered->truncated_len = min(ordered->truncated_len,
8375                                              cur - ordered->file_offset);
8376                 spin_unlock_irq(&inode->ordered_tree.lock);
8377
8378                 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8379                                                    cur, range_end + 1 - cur)) {
8380                         btrfs_finish_ordered_io(ordered);
8381                         /*
8382                          * The ordered extent has finished, now we're again
8383                          * safe to delete all extent states of the range.
8384                          */
8385                         delete_states = true;
8386                 } else {
8387                         /*
8388                          * btrfs_finish_ordered_io() will get executed by endio
8389                          * of other pages, thus we can't delete extent states
8390                          * anymore
8391                          */
8392                         delete_states = false;
8393                 }
8394 next:
8395                 if (ordered)
8396                         btrfs_put_ordered_extent(ordered);
8397                 /*
8398                  * Qgroup reserved space handler
8399                  * Sector(s) here will be either:
8400                  *
8401                  * 1) Already written to disk or bio already finished
8402                  *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8403                  *    Qgroup will be handled by its qgroup_record then.
8404                  *    btrfs_qgroup_free_data() call will do nothing here.
8405                  *
8406                  * 2) Not written to disk yet
8407                  *    Then btrfs_qgroup_free_data() call will clear the
8408                  *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8409                  *    reserved data space.
8410                  *    Since the IO will never happen for this page.
8411                  */
8412                 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
8413                 if (!inode_evicting) {
8414                         clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8415                                  EXTENT_DELALLOC | EXTENT_UPTODATE |
8416                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
8417                                  delete_states, &cached_state);
8418                 }
8419                 cur = range_end + 1;
8420         }
8421         /*
8422          * We have iterated through all ordered extents of the page, the page
8423          * should not have Ordered (Private2) anymore, or the above iteration
8424          * did something wrong.
8425          */
8426         ASSERT(!folio_test_ordered(folio));
8427         btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8428         if (!inode_evicting)
8429                 __btrfs_releasepage(&folio->page, GFP_NOFS);
8430         clear_page_extent_mapped(&folio->page);
8431 }
8432
8433 /*
8434  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8435  * called from a page fault handler when a page is first dirtied. Hence we must
8436  * be careful to check for EOF conditions here. We set the page up correctly
8437  * for a written page which means we get ENOSPC checking when writing into
8438  * holes and correct delalloc and unwritten extent mapping on filesystems that
8439  * support these features.
8440  *
8441  * We are not allowed to take the i_mutex here so we have to play games to
8442  * protect against truncate races as the page could now be beyond EOF.  Because
8443  * truncate_setsize() writes the inode size before removing pages, once we have
8444  * the page lock we can determine safely if the page is beyond EOF. If it is not
8445  * beyond EOF, then the page is guaranteed safe against truncation until we
8446  * unlock the page.
8447  */
8448 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8449 {
8450         struct page *page = vmf->page;
8451         struct inode *inode = file_inode(vmf->vma->vm_file);
8452         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8453         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8454         struct btrfs_ordered_extent *ordered;
8455         struct extent_state *cached_state = NULL;
8456         struct extent_changeset *data_reserved = NULL;
8457         unsigned long zero_start;
8458         loff_t size;
8459         vm_fault_t ret;
8460         int ret2;
8461         int reserved = 0;
8462         u64 reserved_space;
8463         u64 page_start;
8464         u64 page_end;
8465         u64 end;
8466
8467         reserved_space = PAGE_SIZE;
8468
8469         sb_start_pagefault(inode->i_sb);
8470         page_start = page_offset(page);
8471         page_end = page_start + PAGE_SIZE - 1;
8472         end = page_end;
8473
8474         /*
8475          * Reserving delalloc space after obtaining the page lock can lead to
8476          * deadlock. For example, if a dirty page is locked by this function
8477          * and the call to btrfs_delalloc_reserve_space() ends up triggering
8478          * dirty page write out, then the btrfs_writepage() function could
8479          * end up waiting indefinitely to get a lock on the page currently
8480          * being processed by btrfs_page_mkwrite() function.
8481          */
8482         ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8483                                             page_start, reserved_space);
8484         if (!ret2) {
8485                 ret2 = file_update_time(vmf->vma->vm_file);
8486                 reserved = 1;
8487         }
8488         if (ret2) {
8489                 ret = vmf_error(ret2);
8490                 if (reserved)
8491                         goto out;
8492                 goto out_noreserve;
8493         }
8494
8495         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8496 again:
8497         down_read(&BTRFS_I(inode)->i_mmap_lock);
8498         lock_page(page);
8499         size = i_size_read(inode);
8500
8501         if ((page->mapping != inode->i_mapping) ||
8502             (page_start >= size)) {
8503                 /* page got truncated out from underneath us */
8504                 goto out_unlock;
8505         }
8506         wait_on_page_writeback(page);
8507
8508         lock_extent_bits(io_tree, page_start, page_end, &cached_state);
8509         ret2 = set_page_extent_mapped(page);
8510         if (ret2 < 0) {
8511                 ret = vmf_error(ret2);
8512                 unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8513                 goto out_unlock;
8514         }
8515
8516         /*
8517          * we can't set the delalloc bits if there are pending ordered
8518          * extents.  Drop our locks and wait for them to finish
8519          */
8520         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8521                         PAGE_SIZE);
8522         if (ordered) {
8523                 unlock_extent_cached(io_tree, page_start, page_end,
8524                                      &cached_state);
8525                 unlock_page(page);
8526                 up_read(&BTRFS_I(inode)->i_mmap_lock);
8527                 btrfs_start_ordered_extent(ordered, 1);
8528                 btrfs_put_ordered_extent(ordered);
8529                 goto again;
8530         }
8531
8532         if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8533                 reserved_space = round_up(size - page_start,
8534                                           fs_info->sectorsize);
8535                 if (reserved_space < PAGE_SIZE) {
8536                         end = page_start + reserved_space - 1;
8537                         btrfs_delalloc_release_space(BTRFS_I(inode),
8538                                         data_reserved, page_start,
8539                                         PAGE_SIZE - reserved_space, true);
8540                 }
8541         }
8542
8543         /*
8544          * page_mkwrite gets called when the page is firstly dirtied after it's
8545          * faulted in, but write(2) could also dirty a page and set delalloc
8546          * bits, thus in this case for space account reason, we still need to
8547          * clear any delalloc bits within this page range since we have to
8548          * reserve data&meta space before lock_page() (see above comments).
8549          */
8550         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8551                           EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8552                           EXTENT_DEFRAG, 0, 0, &cached_state);
8553
8554         ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8555                                         &cached_state);
8556         if (ret2) {
8557                 unlock_extent_cached(io_tree, page_start, page_end,
8558                                      &cached_state);
8559                 ret = VM_FAULT_SIGBUS;
8560                 goto out_unlock;
8561         }
8562
8563         /* page is wholly or partially inside EOF */
8564         if (page_start + PAGE_SIZE > size)
8565                 zero_start = offset_in_page(size);
8566         else
8567                 zero_start = PAGE_SIZE;
8568
8569         if (zero_start != PAGE_SIZE) {
8570                 memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8571                 flush_dcache_page(page);
8572         }
8573         btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8574         btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8575         btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8576
8577         btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8578
8579         unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8580         up_read(&BTRFS_I(inode)->i_mmap_lock);
8581
8582         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8583         sb_end_pagefault(inode->i_sb);
8584         extent_changeset_free(data_reserved);
8585         return VM_FAULT_LOCKED;
8586
8587 out_unlock:
8588         unlock_page(page);
8589         up_read(&BTRFS_I(inode)->i_mmap_lock);
8590 out:
8591         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8592         btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8593                                      reserved_space, (ret != 0));
8594 out_noreserve:
8595         sb_end_pagefault(inode->i_sb);
8596         extent_changeset_free(data_reserved);
8597         return ret;
8598 }
8599
8600 static int btrfs_truncate(struct inode *inode, bool skip_writeback)
8601 {
8602         struct btrfs_truncate_control control = {
8603                 .inode = BTRFS_I(inode),
8604                 .ino = btrfs_ino(BTRFS_I(inode)),
8605                 .min_type = BTRFS_EXTENT_DATA_KEY,
8606                 .clear_extent_range = true,
8607         };
8608         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8609         struct btrfs_root *root = BTRFS_I(inode)->root;
8610         struct btrfs_block_rsv *rsv;
8611         int ret;
8612         struct btrfs_trans_handle *trans;
8613         u64 mask = fs_info->sectorsize - 1;
8614         u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8615
8616         if (!skip_writeback) {
8617                 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
8618                                                (u64)-1);
8619                 if (ret)
8620                         return ret;
8621         }
8622
8623         /*
8624          * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8625          * things going on here:
8626          *
8627          * 1) We need to reserve space to update our inode.
8628          *
8629          * 2) We need to have something to cache all the space that is going to
8630          * be free'd up by the truncate operation, but also have some slack
8631          * space reserved in case it uses space during the truncate (thank you
8632          * very much snapshotting).
8633          *
8634          * And we need these to be separate.  The fact is we can use a lot of
8635          * space doing the truncate, and we have no earthly idea how much space
8636          * we will use, so we need the truncate reservation to be separate so it
8637          * doesn't end up using space reserved for updating the inode.  We also
8638          * need to be able to stop the transaction and start a new one, which
8639          * means we need to be able to update the inode several times, and we
8640          * have no idea of knowing how many times that will be, so we can't just
8641          * reserve 1 item for the entirety of the operation, so that has to be
8642          * done separately as well.
8643          *
8644          * So that leaves us with
8645          *
8646          * 1) rsv - for the truncate reservation, which we will steal from the
8647          * transaction reservation.
8648          * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8649          * updating the inode.
8650          */
8651         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8652         if (!rsv)
8653                 return -ENOMEM;
8654         rsv->size = min_size;
8655         rsv->failfast = 1;
8656
8657         /*
8658          * 1 for the truncate slack space
8659          * 1 for updating the inode.
8660          */
8661         trans = btrfs_start_transaction(root, 2);
8662         if (IS_ERR(trans)) {
8663                 ret = PTR_ERR(trans);
8664                 goto out;
8665         }
8666
8667         /* Migrate the slack space for the truncate to our reserve */
8668         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8669                                       min_size, false);
8670         BUG_ON(ret);
8671
8672         trans->block_rsv = rsv;
8673
8674         while (1) {
8675                 struct extent_state *cached_state = NULL;
8676                 const u64 new_size = inode->i_size;
8677                 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8678
8679                 control.new_size = new_size;
8680                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
8681                                  &cached_state);
8682                 /*
8683                  * We want to drop from the next block forward in case this new
8684                  * size is not block aligned since we will be keeping the last
8685                  * block of the extent just the way it is.
8686                  */
8687                 btrfs_drop_extent_cache(BTRFS_I(inode),
8688                                         ALIGN(new_size, fs_info->sectorsize),
8689                                         (u64)-1, 0);
8690
8691                 ret = btrfs_truncate_inode_items(trans, root, &control);
8692
8693                 inode_sub_bytes(inode, control.sub_bytes);
8694                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
8695
8696                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
8697                                      (u64)-1, &cached_state);
8698
8699                 trans->block_rsv = &fs_info->trans_block_rsv;
8700                 if (ret != -ENOSPC && ret != -EAGAIN)
8701                         break;
8702
8703                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
8704                 if (ret)
8705                         break;
8706
8707                 btrfs_end_transaction(trans);
8708                 btrfs_btree_balance_dirty(fs_info);
8709
8710                 trans = btrfs_start_transaction(root, 2);
8711                 if (IS_ERR(trans)) {
8712                         ret = PTR_ERR(trans);
8713                         trans = NULL;
8714                         break;
8715                 }
8716
8717                 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8718                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8719                                               rsv, min_size, false);
8720                 BUG_ON(ret);    /* shouldn't happen */
8721                 trans->block_rsv = rsv;
8722         }
8723
8724         /*
8725          * We can't call btrfs_truncate_block inside a trans handle as we could
8726          * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
8727          * know we've truncated everything except the last little bit, and can
8728          * do btrfs_truncate_block and then update the disk_i_size.
8729          */
8730         if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8731                 btrfs_end_transaction(trans);
8732                 btrfs_btree_balance_dirty(fs_info);
8733
8734                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
8735                 if (ret)
8736                         goto out;
8737                 trans = btrfs_start_transaction(root, 1);
8738                 if (IS_ERR(trans)) {
8739                         ret = PTR_ERR(trans);
8740                         goto out;
8741                 }
8742                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
8743         }
8744
8745         if (trans) {
8746                 int ret2;
8747
8748                 trans->block_rsv = &fs_info->trans_block_rsv;
8749                 ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
8750                 if (ret2 && !ret)
8751                         ret = ret2;
8752
8753                 ret2 = btrfs_end_transaction(trans);
8754                 if (ret2 && !ret)
8755                         ret = ret2;
8756                 btrfs_btree_balance_dirty(fs_info);
8757         }
8758 out:
8759         btrfs_free_block_rsv(fs_info, rsv);
8760         /*
8761          * So if we truncate and then write and fsync we normally would just
8762          * write the extents that changed, which is a problem if we need to
8763          * first truncate that entire inode.  So set this flag so we write out
8764          * all of the extents in the inode to the sync log so we're completely
8765          * safe.
8766          *
8767          * If no extents were dropped or trimmed we don't need to force the next
8768          * fsync to truncate all the inode's items from the log and re-log them
8769          * all. This means the truncate operation did not change the file size,
8770          * or changed it to a smaller size but there was only an implicit hole
8771          * between the old i_size and the new i_size, and there were no prealloc
8772          * extents beyond i_size to drop.
8773          */
8774         if (control.extents_found > 0)
8775                 btrfs_set_inode_full_sync(BTRFS_I(inode));
8776
8777         return ret;
8778 }
8779
8780 /*
8781  * create a new subvolume directory/inode (helper for the ioctl).
8782  */
8783 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8784                              struct btrfs_root *new_root,
8785                              struct btrfs_root *parent_root,
8786                              struct user_namespace *mnt_userns)
8787 {
8788         struct inode *inode;
8789         int err;
8790         u64 index = 0;
8791         u64 ino;
8792
8793         err = btrfs_get_free_objectid(new_root, &ino);
8794         if (err < 0)
8795                 return err;
8796
8797         inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
8798                                 ino, ino,
8799                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
8800                                 &index);
8801         if (IS_ERR(inode))
8802                 return PTR_ERR(inode);
8803         inode->i_op = &btrfs_dir_inode_operations;
8804         inode->i_fop = &btrfs_dir_file_operations;
8805
8806         set_nlink(inode, 1);
8807         btrfs_i_size_write(BTRFS_I(inode), 0);
8808         unlock_new_inode(inode);
8809
8810         err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8811         if (err)
8812                 btrfs_err(new_root->fs_info,
8813                           "error inheriting subvolume %llu properties: %d",
8814                           new_root->root_key.objectid, err);
8815
8816         err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
8817
8818         iput(inode);
8819         return err;
8820 }
8821
8822 struct inode *btrfs_alloc_inode(struct super_block *sb)
8823 {
8824         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8825         struct btrfs_inode *ei;
8826         struct inode *inode;
8827
8828         ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8829         if (!ei)
8830                 return NULL;
8831
8832         ei->root = NULL;
8833         ei->generation = 0;
8834         ei->last_trans = 0;
8835         ei->last_sub_trans = 0;
8836         ei->logged_trans = 0;
8837         ei->delalloc_bytes = 0;
8838         ei->new_delalloc_bytes = 0;
8839         ei->defrag_bytes = 0;
8840         ei->disk_i_size = 0;
8841         ei->flags = 0;
8842         ei->ro_flags = 0;
8843         ei->csum_bytes = 0;
8844         ei->index_cnt = (u64)-1;
8845         ei->dir_index = 0;
8846         ei->last_unlink_trans = 0;
8847         ei->last_reflink_trans = 0;
8848         ei->last_log_commit = 0;
8849
8850         spin_lock_init(&ei->lock);
8851         ei->outstanding_extents = 0;
8852         if (sb->s_magic != BTRFS_TEST_MAGIC)
8853                 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8854                                               BTRFS_BLOCK_RSV_DELALLOC);
8855         ei->runtime_flags = 0;
8856         ei->prop_compress = BTRFS_COMPRESS_NONE;
8857         ei->defrag_compress = BTRFS_COMPRESS_NONE;
8858
8859         ei->delayed_node = NULL;
8860
8861         ei->i_otime.tv_sec = 0;
8862         ei->i_otime.tv_nsec = 0;
8863
8864         inode = &ei->vfs_inode;
8865         extent_map_tree_init(&ei->extent_tree);
8866         extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
8867         extent_io_tree_init(fs_info, &ei->io_failure_tree,
8868                             IO_TREE_INODE_IO_FAILURE, inode);
8869         extent_io_tree_init(fs_info, &ei->file_extent_tree,
8870                             IO_TREE_INODE_FILE_EXTENT, inode);
8871         ei->io_tree.track_uptodate = true;
8872         ei->io_failure_tree.track_uptodate = true;
8873         atomic_set(&ei->sync_writers, 0);
8874         mutex_init(&ei->log_mutex);
8875         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8876         INIT_LIST_HEAD(&ei->delalloc_inodes);
8877         INIT_LIST_HEAD(&ei->delayed_iput);
8878         RB_CLEAR_NODE(&ei->rb_node);
8879         init_rwsem(&ei->i_mmap_lock);
8880
8881         return inode;
8882 }
8883
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity test helper: drop the extent cache for the whole range of @inode
 * and hand its btrfs_inode back to btrfs_inode_cachep.
 */
void btrfs_test_destroy_inode(struct inode *inode)
{
	struct btrfs_inode *binode = BTRFS_I(inode);

	btrfs_drop_extent_cache(binode, 0, (u64)-1, 0);
	kmem_cache_free(btrfs_inode_cachep, binode);
}
#endif
8891
8892 void btrfs_free_inode(struct inode *inode)
8893 {
8894         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8895 }
8896
8897 void btrfs_destroy_inode(struct inode *vfs_inode)
8898 {
8899         struct btrfs_ordered_extent *ordered;
8900         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8901         struct btrfs_root *root = inode->root;
8902
8903         WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8904         WARN_ON(vfs_inode->i_data.nrpages);
8905         WARN_ON(inode->block_rsv.reserved);
8906         WARN_ON(inode->block_rsv.size);
8907         WARN_ON(inode->outstanding_extents);
8908         if (!S_ISDIR(vfs_inode->i_mode)) {
8909                 WARN_ON(inode->delalloc_bytes);
8910                 WARN_ON(inode->new_delalloc_bytes);
8911         }
8912         WARN_ON(inode->csum_bytes);
8913         WARN_ON(inode->defrag_bytes);
8914
8915         /*
8916          * This can happen where we create an inode, but somebody else also
8917          * created the same inode and we need to destroy the one we already
8918          * created.
8919          */
8920         if (!root)
8921                 return;
8922
8923         while (1) {
8924                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8925                 if (!ordered)
8926                         break;
8927                 else {
8928                         btrfs_err(root->fs_info,
8929                                   "found ordered extent %llu %llu on inode cleanup",
8930                                   ordered->file_offset, ordered->num_bytes);
8931                         btrfs_remove_ordered_extent(inode, ordered);
8932                         btrfs_put_ordered_extent(ordered);
8933                         btrfs_put_ordered_extent(ordered);
8934                 }
8935         }
8936         btrfs_qgroup_check_reserved_leak(inode);
8937         inode_tree_del(inode);
8938         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8939         btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8940         btrfs_put_root(inode->root);
8941 }
8942
8943 int btrfs_drop_inode(struct inode *inode)
8944 {
8945         struct btrfs_root *root = BTRFS_I(inode)->root;
8946
8947         if (root == NULL)
8948                 return 1;
8949
8950         /* the snap/subvol tree is on deleting */
8951         if (btrfs_root_refs(&root->root_item) == 0)
8952                 return 1;
8953         else
8954                 return generic_drop_inode(inode);
8955 }
8956
8957 static void init_once(void *foo)
8958 {
8959         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8960
8961         inode_init_once(&ei->vfs_inode);
8962 }
8963
8964 void __cold btrfs_destroy_cachep(void)
8965 {
8966         /*
8967          * Make sure all delayed rcu free inodes are flushed before we
8968          * destroy cache.
8969          */
8970         rcu_barrier();
8971         kmem_cache_destroy(btrfs_inode_cachep);
8972         kmem_cache_destroy(btrfs_trans_handle_cachep);
8973         kmem_cache_destroy(btrfs_path_cachep);
8974         kmem_cache_destroy(btrfs_free_space_cachep);
8975         kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
8976 }
8977
8978 int __init btrfs_init_cachep(void)
8979 {
8980         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8981                         sizeof(struct btrfs_inode), 0,
8982                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8983                         init_once);
8984         if (!btrfs_inode_cachep)
8985                 goto fail;
8986
8987         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8988                         sizeof(struct btrfs_trans_handle), 0,
8989                         SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
8990         if (!btrfs_trans_handle_cachep)
8991                 goto fail;
8992
8993         btrfs_path_cachep = kmem_cache_create("btrfs_path",
8994                         sizeof(struct btrfs_path), 0,
8995                         SLAB_MEM_SPREAD, NULL);
8996         if (!btrfs_path_cachep)
8997                 goto fail;
8998
8999         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9000                         sizeof(struct btrfs_free_space), 0,
9001                         SLAB_MEM_SPREAD, NULL);
9002         if (!btrfs_free_space_cachep)
9003                 goto fail;
9004
9005         btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
9006                                                         PAGE_SIZE, PAGE_SIZE,
9007                                                         SLAB_MEM_SPREAD, NULL);
9008         if (!btrfs_free_space_bitmap_cachep)
9009                 goto fail;
9010
9011         return 0;
9012 fail:
9013         btrfs_destroy_cachep();
9014         return -ENOMEM;
9015 }
9016
9017 static int btrfs_getattr(struct user_namespace *mnt_userns,
9018                          const struct path *path, struct kstat *stat,
9019                          u32 request_mask, unsigned int flags)
9020 {
9021         u64 delalloc_bytes;
9022         u64 inode_bytes;
9023         struct inode *inode = d_inode(path->dentry);
9024         u32 blocksize = inode->i_sb->s_blocksize;
9025         u32 bi_flags = BTRFS_I(inode)->flags;
9026         u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
9027
9028         stat->result_mask |= STATX_BTIME;
9029         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9030         stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9031         if (bi_flags & BTRFS_INODE_APPEND)
9032                 stat->attributes |= STATX_ATTR_APPEND;
9033         if (bi_flags & BTRFS_INODE_COMPRESS)
9034                 stat->attributes |= STATX_ATTR_COMPRESSED;
9035         if (bi_flags & BTRFS_INODE_IMMUTABLE)
9036                 stat->attributes |= STATX_ATTR_IMMUTABLE;
9037         if (bi_flags & BTRFS_INODE_NODUMP)
9038                 stat->attributes |= STATX_ATTR_NODUMP;
9039         if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
9040                 stat->attributes |= STATX_ATTR_VERITY;
9041
9042         stat->attributes_mask |= (STATX_ATTR_APPEND |
9043                                   STATX_ATTR_COMPRESSED |
9044                                   STATX_ATTR_IMMUTABLE |
9045                                   STATX_ATTR_NODUMP);
9046
9047         generic_fillattr(mnt_userns, inode, stat);
9048         stat->dev = BTRFS_I(inode)->root->anon_dev;
9049
9050         spin_lock(&BTRFS_I(inode)->lock);
9051         delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9052         inode_bytes = inode_get_bytes(inode);
9053         spin_unlock(&BTRFS_I(inode)->lock);
9054         stat->blocks = (ALIGN(inode_bytes, blocksize) +
9055                         ALIGN(delalloc_bytes, blocksize)) >> 9;
9056         return 0;
9057 }
9058
/*
 * Exchange the directory entries @old_dentry (in @old_dir) and @new_dentry
 * (in @new_dir) inside a single transaction.
 *
 * An inode whose number is BTRFS_FIRST_FREE_OBJECTID is handled as a
 * subvolume: no inode ref is inserted for it, it is unlinked with
 * btrfs_unlink_subvol(), a full log commit is forced and its new name is not
 * logged.
 *
 * Returns 0 on success, -EXDEV if the two directories belong to different
 * roots and at least one of the inodes is not a subvolume, otherwise the
 * error of the step that failed.
 */
static int btrfs_rename_exchange(struct inode *old_dir,
			      struct dentry *old_dentry,
			      struct inode *new_dir,
			      struct dentry *new_dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct timespec64 ctime = current_time(old_inode);
	/* Only filled in, and only read, for inodes that are not subvolumes. */
	struct btrfs_rename_ctx old_rename_ctx;
	struct btrfs_rename_ctx new_rename_ctx;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
	int ret2;
	bool need_abort = false;

	/*
	 * For non-subvolumes allow exchange only within one subvolume, in the
	 * same inode namespace. Two subvolumes (represented as directory) can
	 * be exchanged as they're a logical link and have a fixed inode number.
	 */
	if (root != dest &&
	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
		return -EXDEV;

	/* close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);

	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they are normal
	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 12);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
	if (ret)
		goto out_fail;
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
		if (ret)
			goto out_fail;
		/*
		 * An inode ref has been inserted, so a failure of the second
		 * insertion below has to abort the transaction.
		 */
		need_abort = true;
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, root,
					     old_dentry->d_name.name,
					     old_dentry->d_name.len,
					     new_ino,
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
		if (ret) {
			if (need_abort)
				btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;
	new_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
				BTRFS_I(old_inode), 1);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
				BTRFS_I(new_inode), 1);
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
	} else { /* src is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(old_dentry->d_inode),
					   old_dentry->d_name.name,
					   old_dentry->d_name.len,
					   &old_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
	} else { /* dest is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
					   BTRFS_I(new_dentry->d_inode),
					   new_dentry->d_name.name,
					   new_dentry->d_name.len,
					   &new_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* Link the old inode under the new name, using the index reserved above. */
	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, old_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* And the new inode under the old name. */
	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
			     old_dentry->d_name.name,
			     old_dentry->d_name.len, 0, new_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* The dir index is only stored again for inodes with a single link. */
	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_rename_ctx.index, new_dentry->d_parent);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_rename_ctx.index, old_dentry->d_parent);

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(dest);
out_fail:
	/* The first error wins, the end transaction result is the fallback. */
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	/* Same condition as the down_read() at the top. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	return ret;
}
9270
9271 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9272                                      struct btrfs_root *root,
9273                                      struct user_namespace *mnt_userns,
9274                                      struct inode *dir,
9275                                      struct dentry *dentry)
9276 {
9277         int ret;
9278         struct inode *inode;
9279         u64 objectid;
9280         u64 index;
9281
9282         ret = btrfs_get_free_objectid(root, &objectid);
9283         if (ret)
9284                 return ret;
9285
9286         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
9287                                 dentry->d_name.name,
9288                                 dentry->d_name.len,
9289                                 btrfs_ino(BTRFS_I(dir)),
9290                                 objectid,
9291                                 S_IFCHR | WHITEOUT_MODE,
9292                                 &index);
9293
9294         if (IS_ERR(inode)) {
9295                 ret = PTR_ERR(inode);
9296                 return ret;
9297         }
9298
9299         inode->i_op = &btrfs_special_inode_operations;
9300         init_special_inode(inode, inode->i_mode,
9301                 WHITEOUT_DEV);
9302
9303         ret = btrfs_init_inode_security(trans, inode, dir,
9304                                 &dentry->d_name);
9305         if (ret)
9306                 goto out;
9307
9308         ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9309                                 BTRFS_I(inode), 0, index);
9310         if (ret)
9311                 goto out;
9312
9313         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9314 out:
9315         unlock_new_inode(inode);
9316         if (ret)
9317                 inode_dec_link_count(inode);
9318         iput(inode);
9319
9320         return ret;
9321 }
9322
9323 static int btrfs_rename(struct user_namespace *mnt_userns,
9324                         struct inode *old_dir, struct dentry *old_dentry,
9325                         struct inode *new_dir, struct dentry *new_dentry,
9326                         unsigned int flags)
9327 {
9328         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9329         struct btrfs_trans_handle *trans;
9330         unsigned int trans_num_items;
9331         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9332         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9333         struct inode *new_inode = d_inode(new_dentry);
9334         struct inode *old_inode = d_inode(old_dentry);
9335         struct btrfs_rename_ctx rename_ctx;
9336         u64 index = 0;
9337         int ret;
9338         int ret2;
9339         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9340
9341         if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9342                 return -EPERM;
9343
9344         /* we only allow rename subvolume link between subvolumes */
9345         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9346                 return -EXDEV;
9347
9348         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9349             (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9350                 return -ENOTEMPTY;
9351
9352         if (S_ISDIR(old_inode->i_mode) && new_inode &&
9353             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9354                 return -ENOTEMPTY;
9355
9356
9357         /* check for collisions, even if the  name isn't there */
9358         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9359                              new_dentry->d_name.name,
9360                              new_dentry->d_name.len);
9361
9362         if (ret) {
9363                 if (ret == -EEXIST) {
9364                         /* we shouldn't get
9365                          * eexist without a new_inode */
9366                         if (WARN_ON(!new_inode)) {
9367                                 return ret;
9368                         }
9369                 } else {
9370                         /* maybe -EOVERFLOW */
9371                         return ret;
9372                 }
9373         }
9374         ret = 0;
9375
9376         /*
9377          * we're using rename to replace one file with another.  Start IO on it
9378          * now so  we don't add too much work to the end of the transaction
9379          */
9380         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9381                 filemap_flush(old_inode->i_mapping);
9382
9383         /* close the racy window with snapshot create/destroy ioctl */
9384         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9385                 down_read(&fs_info->subvol_sem);
9386         /*
9387          * We want to reserve the absolute worst case amount of items.  So if
9388          * both inodes are subvols and we need to unlink them then that would
9389          * require 4 item modifications, but if they are both normal inodes it
9390          * would require 5 item modifications, so we'll assume they are normal
9391          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9392          * should cover the worst case number of items we'll modify.
9393          * If our rename has the whiteout flag, we need more 5 units for the
9394          * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9395          * when selinux is enabled).
9396          */
9397         trans_num_items = 11;
9398         if (flags & RENAME_WHITEOUT)
9399                 trans_num_items += 5;
9400         trans = btrfs_start_transaction(root, trans_num_items);
9401         if (IS_ERR(trans)) {
9402                 ret = PTR_ERR(trans);
9403                 goto out_notrans;
9404         }
9405
9406         if (dest != root) {
9407                 ret = btrfs_record_root_in_trans(trans, dest);
9408                 if (ret)
9409                         goto out_fail;
9410         }
9411
9412         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9413         if (ret)
9414                 goto out_fail;
9415
9416         BTRFS_I(old_inode)->dir_index = 0ULL;
9417         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9418                 /* force full log commit if subvolume involved. */
9419                 btrfs_set_log_full_commit(trans);
9420         } else {
9421                 ret = btrfs_insert_inode_ref(trans, dest,
9422                                              new_dentry->d_name.name,
9423                                              new_dentry->d_name.len,
9424                                              old_ino,
9425                                              btrfs_ino(BTRFS_I(new_dir)), index);
9426                 if (ret)
9427                         goto out_fail;
9428         }
9429
9430         inode_inc_iversion(old_dir);
9431         inode_inc_iversion(new_dir);
9432         inode_inc_iversion(old_inode);
9433         old_dir->i_ctime = old_dir->i_mtime =
9434         new_dir->i_ctime = new_dir->i_mtime =
9435         old_inode->i_ctime = current_time(old_dir);
9436
9437         if (old_dentry->d_parent != new_dentry->d_parent)
9438                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9439                                 BTRFS_I(old_inode), 1);
9440
9441         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9442                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
9443         } else {
9444                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9445                                         BTRFS_I(d_inode(old_dentry)),
9446                                         old_dentry->d_name.name,
9447                                         old_dentry->d_name.len,
9448                                         &rename_ctx);
9449                 if (!ret)
9450                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9451         }
9452         if (ret) {
9453                 btrfs_abort_transaction(trans, ret);
9454                 goto out_fail;
9455         }
9456
9457         if (new_inode) {
9458                 inode_inc_iversion(new_inode);
9459                 new_inode->i_ctime = current_time(new_inode);
9460                 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9461                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9462                         ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
9463                         BUG_ON(new_inode->i_nlink == 0);
9464                 } else {
9465                         ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9466                                                  BTRFS_I(d_inode(new_dentry)),
9467                                                  new_dentry->d_name.name,
9468                                                  new_dentry->d_name.len);
9469                 }
9470                 if (!ret && new_inode->i_nlink == 0)
9471                         ret = btrfs_orphan_add(trans,
9472                                         BTRFS_I(d_inode(new_dentry)));
9473                 if (ret) {
9474                         btrfs_abort_transaction(trans, ret);
9475                         goto out_fail;
9476                 }
9477         }
9478
9479         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9480                              new_dentry->d_name.name,
9481                              new_dentry->d_name.len, 0, index);
9482         if (ret) {
9483                 btrfs_abort_transaction(trans, ret);
9484                 goto out_fail;
9485         }
9486
9487         if (old_inode->i_nlink == 1)
9488                 BTRFS_I(old_inode)->dir_index = index;
9489
9490         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9491                 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9492                                    rename_ctx.index, new_dentry->d_parent);
9493
9494         if (flags & RENAME_WHITEOUT) {
9495                 ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
9496                                                 old_dir, old_dentry);
9497
9498                 if (ret) {
9499                         btrfs_abort_transaction(trans, ret);
9500                         goto out_fail;
9501                 }
9502         }
9503 out_fail:
9504         ret2 = btrfs_end_transaction(trans);
9505         ret = ret ? ret : ret2;
9506 out_notrans:
9507         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9508                 up_read(&fs_info->subvol_sem);
9509
9510         return ret;
9511 }
9512
/*
 * Rename dispatcher: reject any flag outside NOREPLACE/EXCHANGE/WHITEOUT with
 * -EINVAL, hand RENAME_EXCHANGE requests to btrfs_rename_exchange() and let
 * btrfs_rename() deal with everything else.
 */
9513 static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
9514                          struct dentry *old_dentry, struct inode *new_dir,
9515                          struct dentry *new_dentry, unsigned int flags)
9516 {
9517         const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE |
9518                                        RENAME_WHITEOUT;
9519
9520         if (flags & ~supported)
9521                 return -EINVAL;
9522         if (!(flags & RENAME_EXCHANGE))
9523                 return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
9524                                     new_dentry, flags);
9525         return btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
9526 }
9527
/*
 * One unit of asynchronous delalloc flushing: the inode to flush, a completion
 * signalled by btrfs_run_delalloc_work() once it is done with the inode, the
 * link used by start_delalloc_inodes() to keep track of queued items, and the
 * btrfs work item itself.
 */
9528 struct btrfs_delalloc_work {
9529         struct inode *inode;
9530         struct completion completion;
9531         struct list_head list;
9532         struct btrfs_work work;
9533 };
9534
/*
 * Work function of a struct btrfs_delalloc_work: flush the page cache of the
 * inode stored in the work item, drop the inode reference that came with it
 * and signal the completion.
 */
9535 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9536 {
9537         struct btrfs_delalloc_work *delalloc_work;
9538         struct inode *inode;
9539
9540         delalloc_work = container_of(work, struct btrfs_delalloc_work,
9541                                      work);
9542         inode = delalloc_work->inode;
9543         filemap_flush(inode->i_mapping);
             /*
              * Flush a second time when the inode carries the HAS_ASYNC_EXTENT
              * runtime flag.  NOTE(review): presumably the first pass only
              * handed the pages over to the async extent path - confirm.
              */
9544         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9545                                 &BTRFS_I(inode)->runtime_flags))
9546                 filemap_flush(inode->i_mapping);
9547
             /*
              * Release the inode before signalling: the waiter in
              * start_delalloc_inodes() frees the work item after the completion.
              */
9548         iput(inode);
9549         complete(&delalloc_work->completion);
9550 }
9551
/*
 * Allocate (GFP_NOFS) and initialise a delalloc work item for @inode whose
 * work function is btrfs_run_delalloc_work().  Returns NULL when the
 * allocation fails.
 */
9552 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9553 {
9554         struct btrfs_delalloc_work *dw = kmalloc(sizeof(*dw), GFP_NOFS);
9555
9556         if (dw) {
9557                 dw->inode = inode;
9558                 INIT_LIST_HEAD(&dw->list);
9559                 init_completion(&dw->completion);
9560                 btrfs_init_work(&dw->work, btrfs_run_delalloc_work,
9561                                 NULL, NULL);
9562         }
9563
9564         /* NULL here means the allocation failed. */
9565         return dw;
9566 }
9567
9568 /*
9569  * some fairly slow code that needs optimization. This walks the list
9570  * of all the inodes with pending delalloc and forces them to disk.
      *
      * @root:               root whose delalloc_inodes list is walked
      * @wbc:                writeback control; nr_to_write == LONG_MAX selects a
      *                      full flush (one queued work item per inode, all of
      *                      them waited for), any other value writes each inode
      *                      inline until the budget is used up
      * @snapshot:           set BTRFS_INODE_SNAPSHOT_FLUSH on every visited inode
      * @in_reclaim_context: skip inodes flagged BTRFS_INODE_NO_DELALLOC_FLUSH
      *
      * Returns 0, -ENOMEM if a work item cannot be allocated, or the error
      * returned by filemap_fdatawrite_wbc().
9571  */
9572 static int start_delalloc_inodes(struct btrfs_root *root,
9573                                  struct writeback_control *wbc, bool snapshot,
9574                                  bool in_reclaim_context)
9575 {
9576         struct btrfs_inode *binode;
9577         struct inode *inode;
9578         struct btrfs_delalloc_work *work, *next;
9579         struct list_head works;
9580         struct list_head splice;
9581         int ret = 0;
9582         bool full_flush = wbc->nr_to_write == LONG_MAX;
9583
9584         INIT_LIST_HEAD(&works);
9585         INIT_LIST_HEAD(&splice);
9586
9587         mutex_lock(&root->delalloc_mutex);
9588         spin_lock(&root->delalloc_lock);
             /* Take the whole list private; entries go back one by one below. */
9589         list_splice_init(&root->delalloc_inodes, &splice);
9590         while (!list_empty(&splice)) {
9591                 binode = list_entry(splice.next, struct btrfs_inode,
9592                                     delalloc_inodes);
9593
                     /*
                      * Put the inode back on the root's list before doing anything
                      * with it, so the splice list only holds what is still left
                      * to visit.
                      */
9594                 list_move_tail(&binode->delalloc_inodes,
9595                                &root->delalloc_inodes);
9596
9597                 if (in_reclaim_context &&
9598                     test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9599                         continue;
9600
9601                 inode = igrab(&binode->vfs_inode);
9602                 if (!inode) {
                             /* No reference could be taken, skip this inode. */
9603                         cond_resched_lock(&root->delalloc_lock);
9604                         continue;
9605                 }
                     /* The flushing below runs without the spinlock held. */
9606                 spin_unlock(&root->delalloc_lock);
9607
9608                 if (snapshot)
9609                         set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9610                                 &binode->runtime_flags);
9611                 if (full_flush) {
                             /*
                              * Queue the flush; the inode reference travels with the
                              * work item and is dropped by the work function.
                              */
9612                         work = btrfs_alloc_delalloc_work(inode);
9613                         if (!work) {
9614                                 iput(inode);
9615                                 ret = -ENOMEM;
9616                                 goto out;
9617                         }
9618                         list_add_tail(&work->list, &works);
9619                         btrfs_queue_work(root->fs_info->flush_workers,
9620                                          &work->work);
9621                 } else {
                             /*
                              * Write this inode with the caller's wbc and stop on
                              * error or once nr_to_write has been used up.
                              */
9622                         ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9623                         btrfs_add_delayed_iput(inode);
9624                         if (ret || wbc->nr_to_write <= 0)
9625                                 goto out;
9626                 }
9627                 cond_resched();
9628                 spin_lock(&root->delalloc_lock);
9629         }
9630         spin_unlock(&root->delalloc_lock);
9631
9632 out:
             /* Reached with delalloc_lock released; wait for every queued flush. */
9633         list_for_each_entry_safe(work, next, &works, list) {
9634                 list_del_init(&work->list);
9635                 wait_for_completion(&work->completion);
9636                 kfree(work);
9637         }
9638
             /* Hand inodes that were not visited back to the root's list. */
9639         if (!list_empty(&splice)) {
9640                 spin_lock(&root->delalloc_lock);
9641                 list_splice_tail(&splice, &root->delalloc_inodes);
9642                 spin_unlock(&root->delalloc_lock);
9643         }
9644         mutex_unlock(&root->delalloc_mutex);
9645         return ret;
9646 }
9647
/*
 * Start a full delalloc flush of @root with the snapshot flag set for
 * start_delalloc_inodes().  Returns -EROFS when the filesystem is in an error
 * state, otherwise whatever start_delalloc_inodes() returns.
 */
9648 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9649 {
9650         /* nr_to_write == LONG_MAX selects the full-flush path of the helper. */
9651         struct writeback_control full_wbc = {
9652                 .sync_mode = WB_SYNC_NONE,
9653                 .nr_to_write = LONG_MAX,
9654                 .range_start = 0,
9655                 .range_end = LLONG_MAX,
9656         };
9657
9658         if (BTRFS_FS_ERROR(root->fs_info))
9659                 return -EROFS;
9660
9661         return start_delalloc_inodes(root, &full_wbc, true, in_reclaim_context);
9662 }
9663
/*
 * Start delalloc writeback on the roots queued on fs_info->delalloc_roots.
 *
 * @fs_info:            filesystem whose delalloc roots are walked
 * @nr:                 writeback budget placed in wbc.nr_to_write; LONG_MAX
 *                      requests a full flush of every root
 * @in_reclaim_context: passed through to start_delalloc_inodes()
 *
 * Returns -EROFS when the filesystem is in an error state, a negative error
 * from start_delalloc_inodes(), or 0.
 */
9664 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9665                                bool in_reclaim_context)
9666 {
9667         struct writeback_control wbc = {
9668                 .nr_to_write = nr,
9669                 .sync_mode = WB_SYNC_NONE,
9670                 .range_start = 0,
9671                 .range_end = LLONG_MAX,
9672         };
9673         struct btrfs_root *root;
9674         struct list_head splice;
9675         int ret;
9676
9677         if (BTRFS_FS_ERROR(fs_info))
9678                 return -EROFS;
9679
9680         INIT_LIST_HEAD(&splice);
9681
9682         mutex_lock(&fs_info->delalloc_root_mutex);
9683         spin_lock(&fs_info->delalloc_root_lock);
             /* Work on a private copy of the list; roots are moved back below. */
9684         list_splice_init(&fs_info->delalloc_roots, &splice);
9685         while (!list_empty(&splice)) {
9686                 /*
9687                  * Reset nr_to_write here so we know that we're doing a full
9688                  * flush.
9689                  */
9690                 if (nr == LONG_MAX)
9691                         wbc.nr_to_write = LONG_MAX;
9692
9693                 root = list_first_entry(&splice, struct btrfs_root,
9694                                         delalloc_root);
9695                 root = btrfs_grab_root(root);
9696                 BUG_ON(!root);
9697                 list_move_tail(&root->delalloc_root,
9698                                &fs_info->delalloc_roots);
                     /* Flushing the root's inodes runs without the spinlock held. */
9699                 spin_unlock(&fs_info->delalloc_root_lock);
9700
9701                 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9702                 btrfs_put_root(root);
                     /* Stop on error or once the writeback budget is used up. */
9703                 if (ret < 0 || wbc.nr_to_write <= 0)
9704                         goto out;
9705                 spin_lock(&fs_info->delalloc_root_lock);
9706         }
9707         spin_unlock(&fs_info->delalloc_root_lock);
9708
             /* The loop may not have run at all, so set the result explicitly. */
9709         ret = 0;
9710 out:
             /* Return roots that were not visited to the filesystem's list. */
9711         if (!list_empty(&splice)) {
9712                 spin_lock(&fs_info->delalloc_root_lock);
9713                 list_splice_tail(&splice, &fs_info->delalloc_roots);
9714                 spin_unlock(&fs_info->delalloc_root_lock);
9715         }
9716         mutex_unlock(&fs_info->delalloc_root_mutex);
9717         return ret;
9718 }
9719
/*
 * Create the symlink @dentry in directory @dir with target @symname.
 *
 * The target string is stored as an inline file extent item (key offset 0) of
 * the new inode, so a target longer than BTRFS_MAX_INLINE_DATA_SIZE() is
 * rejected with -ENAMETOOLONG.  Returns 0 on success or a negative errno.
 */
9720 static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
9721                          struct dentry *dentry, const char *symname)
9722 {
9723         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9724         struct btrfs_trans_handle *trans;
9725         struct btrfs_root *root = BTRFS_I(dir)->root;
9726         struct btrfs_path *path;
9727         struct btrfs_key key;
9728         struct inode *inode = NULL;
9729         int err;
9730         u64 objectid;
9731         u64 index = 0;
9732         int name_len;
9733         int datasize;
9734         unsigned long ptr;
9735         struct btrfs_file_extent_item *ei;
9736         struct extent_buffer *leaf;
9737
9738         name_len = strlen(symname);
9739         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9740                 return -ENAMETOOLONG;
9741
9742         /*
9743          * 2 items for inode item and ref
9744          * 2 items for dir items
9745          * 1 item for updating parent inode item
9746          * 1 item for the inline extent item
9747          * 1 item for xattr if selinux is on
9748          */
9749         trans = btrfs_start_transaction(root, 7);
9750         if (IS_ERR(trans))
9751                 return PTR_ERR(trans);
9752
9753         err = btrfs_get_free_objectid(root, &objectid);
9754         if (err)
9755                 goto out_unlock;
9756
9757         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
9758                                 dentry->d_name.name, dentry->d_name.len,
9759                                 btrfs_ino(BTRFS_I(dir)), objectid,
9760                                 S_IFLNK | S_IRWXUGO, &index);
9761         if (IS_ERR(inode)) {
9762                 err = PTR_ERR(inode);
                     /* Keep the cleanup at out_unlock from touching an error pointer. */
9763                 inode = NULL;
9764                 goto out_unlock;
9765         }
9766
9767         /*
9768          * If the active LSM wants to access the inode during
9769          * d_instantiate it needs these. Smack checks to see
9770          * if the filesystem supports xattrs by looking at the
9771          * ops vector.
9772          */
9773         inode->i_fop = &btrfs_file_operations;
9774         inode->i_op = &btrfs_file_inode_operations;
9775         inode->i_mapping->a_ops = &btrfs_aops;
9776
9777         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9778         if (err)
9779                 goto out_unlock;
9780
9781         path = btrfs_alloc_path();
9782         if (!path) {
9783                 err = -ENOMEM;
9784                 goto out_unlock;
9785         }
             /* Insert an empty inline extent item sized for the target string. */
9786         key.objectid = btrfs_ino(BTRFS_I(inode));
9787         key.offset = 0;
9788         key.type = BTRFS_EXTENT_DATA_KEY;
9789         datasize = btrfs_file_extent_calc_inline_size(name_len);
9790         err = btrfs_insert_empty_item(trans, root, path, &key,
9791                                       datasize);
9792         if (err) {
9793                 btrfs_free_path(path);
9794                 goto out_unlock;
9795         }
             /* Fill in the item header: inline, not encrypted, not compressed. */
9796         leaf = path->nodes[0];
9797         ei = btrfs_item_ptr(leaf, path->slots[0],
9798                             struct btrfs_file_extent_item);
9799         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9800         btrfs_set_file_extent_type(leaf, ei,
9801                                    BTRFS_FILE_EXTENT_INLINE);
9802         btrfs_set_file_extent_encryption(leaf, ei, 0);
9803         btrfs_set_file_extent_compression(leaf, ei, 0);
9804         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9805         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9806
             /* Copy the target (name_len bytes, no terminating NUL) into the item. */
9807         ptr = btrfs_file_extent_inline_start(ei);
9808         write_extent_buffer(leaf, symname, ptr, name_len);
9809         btrfs_mark_buffer_dirty(leaf);
9810         btrfs_free_path(path);
9811
             /* Switch from the temporary file ops set above to the symlink ops. */
9812         inode->i_op = &btrfs_symlink_inode_operations;
9813         inode_nohighmem(inode);
9814         inode_set_bytes(inode, name_len);
9815         btrfs_i_size_write(BTRFS_I(inode), name_len);
9816         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
9817         /*
9818          * Last step, add directory indexes for our symlink inode. This is the
9819          * last step to avoid extra cleanup of these indexes if an error happens
9820          * elsewhere above.
9821          */
9822         if (!err)
9823                 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9824                                 BTRFS_I(inode), 0, index);
9825         if (err)
9826                 goto out_unlock;
9827
9828         d_instantiate_new(dentry, inode);
9829
9830 out_unlock:
9831         btrfs_end_transaction(trans);
             /* On failure, drop the link count and discard the half-built inode. */
9832         if (err && inode) {
9833                 inode_dec_link_count(inode);
9834                 discard_new_inode(inode);
9835         }
9836         btrfs_btree_balance_dirty(fs_info);
9837         return err;
9838 }
9839
/*
 * Insert a BTRFS_FILE_EXTENT_PREALLOC item at @file_offset of @inode for the
 * extent described by @ins (objectid = disk bytenr, offset = length in bytes).
 *
 * With a non-NULL @trans_in the item is inserted through
 * insert_reserved_file_extent() and @trans_in is returned.  With a NULL
 * @trans_in the range is handed to btrfs_replace_file_extents() together with
 * the address of the local handle, and whatever handle that call leaves there
 * is returned.
 *
 * Returns a transaction handle on success or an ERR_PTR() on failure.
 */
9840 static struct btrfs_trans_handle *insert_prealloc_file_extent(
9841                                        struct btrfs_trans_handle *trans_in,
9842                                        struct btrfs_inode *inode,
9843                                        struct btrfs_key *ins,
9844                                        u64 file_offset)
9845 {
9846         struct btrfs_file_extent_item stack_fi;
9847         struct btrfs_replace_extent_info extent_info;
9848         struct btrfs_trans_handle *trans = trans_in;
9849         struct btrfs_path *path;
9850         u64 start = ins->objectid;
9851         u64 len = ins->offset;
9852         int qgroup_released;
9853         int ret;
9854
             /* Build the file extent item on the stack first. */
9855         memset(&stack_fi, 0, sizeof(stack_fi));
9856
9857         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9858         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9859         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9860         btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9861         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9862         btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9863         /* Encryption and other encoding is reserved and all 0 */
9864
             /* Nothing has been released on failure, so no cleanup is needed yet. */
9865         qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
9866         if (qgroup_released < 0)
9867                 return ERR_PTR(qgroup_released);
9868
             /* Caller supplied a transaction: insert directly inside it. */
9869         if (trans) {
9870                 ret = insert_reserved_file_extent(trans, inode,
9871                                                   file_offset, &stack_fi,
9872                                                   true, qgroup_released);
9873                 if (ret)
9874                         goto free_qgroup;
9875                 return trans;
9876         }
9877
             /* No transaction yet: describe the new extent for the replace helper. */
9878         extent_info.disk_offset = start;
9879         extent_info.disk_len = len;
9880         extent_info.data_offset = 0;
9881         extent_info.data_len = len;
9882         extent_info.file_offset = file_offset;
9883         extent_info.extent_buf = (char *)&stack_fi;
9884         extent_info.is_new_extent = true;
9885         extent_info.qgroup_reserved = qgroup_released;
9886         extent_info.insertions = 0;
9887
9888         path = btrfs_alloc_path();
9889         if (!path) {
9890                 ret = -ENOMEM;
9891                 goto free_qgroup;
9892         }
9893
9894         ret = btrfs_replace_file_extents(inode, path, file_offset,
9895                                      file_offset + len - 1, &extent_info,
9896                                      &trans);
9897         btrfs_free_path(path);
9898         if (ret)
9899                 goto free_qgroup;
9900         return trans;
9901
9902 free_qgroup:
9903         /*
9904          * We have released qgroup data range at the beginning of the function,
9905          * and normally qgroup_released bytes will be freed when committing
9906          * transaction.
9907          * But if we error out early, we have to free what we have released
9908          * or we leak qgroup data reservation.
9909          */
9910         btrfs_qgroup_free_refroot(inode->root->fs_info,
9911                         inode->root->root_key.objectid, qgroup_released,
9912                         BTRFS_QGROUP_RSV_DATA);
9913         return ERR_PTR(ret);
9914 }
9915
/*
 * Preallocate extents for the file range [@start, @start + @num_bytes).
 *
 * The range is processed in chunks.  Each request starts from
 * min(remaining bytes, 256M), is raised to @min_size and capped by the size
 * of the previous allocation.  Every chunk gets a reserved extent, a prealloc
 * file extent item, a cached extent map and an inode update.
 *
 * @mode:       falloc mode; without FALLOC_FL_KEEP_SIZE i_size may be raised,
 *              but never beyond @actual_len
 * @alloc_hint: in/out allocation hint, set to the end of the last extent
 * @trans:      optional transaction handle; when NULL every chunk uses its
 *              own handle, which is ended before the next chunk starts
 *
 * The data space reservation for the part of the range that was not turned
 * into extents is freed before returning.  Returns 0 on success or the error
 * that stopped the loop.
 */
9916 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9917                                        u64 start, u64 num_bytes, u64 min_size,
9918                                        loff_t actual_len, u64 *alloc_hint,
9919                                        struct btrfs_trans_handle *trans)
9920 {
9921         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9922         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9923         struct extent_map *em;
9924         struct btrfs_root *root = BTRFS_I(inode)->root;
9925         struct btrfs_key ins;
9926         u64 cur_offset = start;
9927         u64 clear_offset = start;
9928         u64 i_size;
9929         u64 cur_bytes;
9930         u64 last_alloc = (u64)-1;
9931         int ret = 0;
9932         bool own_trans = true;
9933         u64 end = start + num_bytes - 1;
9934
             /* A handle passed in by the caller is never ended here. */
9935         if (trans)
9936                 own_trans = false;
9937         while (num_bytes > 0) {
9938                 cur_bytes = min_t(u64, num_bytes, SZ_256M);
9939                 cur_bytes = max(cur_bytes, min_size);
9940                 /*
9941                  * If we are severely fragmented we could end up with really
9942                  * small allocations, so if the allocator is returning small
9943                  * chunks lets make its job easier by only searching for those
9944                  * sized chunks.
9945                  */
9946                 cur_bytes = min(cur_bytes, last_alloc);
9947                 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9948                                 min_size, 0, *alloc_hint, &ins, 1, 0);
9949                 if (ret)
9950                         break;
9951
9952                 /*
9953                  * We've reserved this space, and thus converted it from
9954                  * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9955                  * from here on out we will only need to clear our reservation
9956                  * for the remaining unreserved area, so advance our
9957                  * clear_offset by our extent size.
9958                  */
9959                 clear_offset += ins.offset;
9960
9961                 last_alloc = ins.offset;
                     /*
                      * With own_trans set, trans is NULL at this point and the call
                      * hands back the handle to use for the rest of this iteration.
                      */
9962                 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9963                                                     &ins, cur_offset);
9964                 /*
9965                  * Now that we inserted the prealloc extent we can finally
9966                  * decrement the number of reservations in the block group.
9967                  * If we did it before, we could race with relocation and have
9968                  * relocation miss the reserved extent, making it fail later.
9969                  */
9970                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9971                 if (IS_ERR(trans)) {
                             /* The item was not inserted: give the extent back. */
9972                         ret = PTR_ERR(trans);
9973                         btrfs_free_reserved_extent(fs_info, ins.objectid,
9974                                                    ins.offset, 0);
9975                         break;
9976                 }
9977
                     /* Drop cached extent maps covering the new extent's range. */
9978                 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
9979                                         cur_offset + ins.offset -1, 0);
9980
9981                 em = alloc_extent_map();
9982                 if (!em) {
                             /*
                              * No extent map can be cached for this chunk; flag the
                              * inode for a full sync and carry on.
                              */
9983                         btrfs_set_inode_full_sync(BTRFS_I(inode));
9984                         goto next;
9985                 }
9986
9987                 em->start = cur_offset;
9988                 em->orig_start = cur_offset;
9989                 em->len = ins.offset;
9990                 em->block_start = ins.objectid;
9991                 em->block_len = ins.offset;
9992                 em->orig_block_len = ins.offset;
9993                 em->ram_bytes = ins.offset;
9994                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9995                 em->generation = trans->transid;
9996
                     /*
                      * Retry the insertion for as long as it reports -EEXIST,
                      * dropping the cached range again before every new attempt.
                      */
9997                 while (1) {
9998                         write_lock(&em_tree->lock);
9999                         ret = add_extent_mapping(em_tree, em, 1);
10000                         write_unlock(&em_tree->lock);
10001                         if (ret != -EEXIST)
10002                                 break;
10003                         btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10004                                                 cur_offset + ins.offset - 1,
10005                                                 0);
10006                 }
10007                 free_extent_map(em);
10008 next:
                      /* Advance past the extent that was just allocated. */
10009                 num_bytes -= ins.offset;
10010                 cur_offset += ins.offset;
10011                 *alloc_hint = ins.objectid + ins.offset;
10012
10013                 inode_inc_iversion(inode);
10014                 inode->i_ctime = current_time(inode);
10015                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                      /*
                       * Grow i_size up to the allocated end, but never beyond
                       * actual_len, unless the caller asked to keep the size.
                       */
10016                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10017                     (actual_len > inode->i_size) &&
10018                     (cur_offset > inode->i_size)) {
10019                         if (cur_offset > actual_len)
10020                                 i_size = actual_len;
10021                         else
10022                                 i_size = cur_offset;
10023                         i_size_write(inode, i_size);
10024                         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
10025                 }
10026
10027                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10028
10029                 if (ret) {
10030                         btrfs_abort_transaction(trans, ret);
10031                         if (own_trans)
10032                                 btrfs_end_transaction(trans);
10033                         break;
10034                 }
10035
                      /* Our own handle ends here; the next chunk gets a fresh one. */
10036                 if (own_trans) {
10037                         btrfs_end_transaction(trans);
10038                         trans = NULL;
10039                 }
10040         }
              /* Free the data reservation for whatever was left unallocated. */
10041         if (clear_offset < end)
10042                 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
10043                         end - clear_offset + 1);
10044         return ret;
10045 }
10046
/*
 * Preallocate a file range without a caller-provided transaction: forwards to
 * __btrfs_prealloc_file_range() with a NULL transaction handle.
 */
10047 int btrfs_prealloc_file_range(struct inode *inode, int mode,
10048                               u64 start, u64 num_bytes, u64 min_size,
10049                               loff_t actual_len, u64 *alloc_hint)
10050 {
10051         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10052                                            min_size, actual_len, alloc_hint,
10053                                            NULL);
10054 }
10055
/*
 * Preallocate a file range inside the caller's transaction: forwards to
 * __btrfs_prealloc_file_range() with @trans.
 */
10056 int btrfs_prealloc_file_range_trans(struct inode *inode,
10057                                     struct btrfs_trans_handle *trans, int mode,
10058                                     u64 start, u64 num_bytes, u64 min_size,
10059                                     loff_t actual_len, u64 *alloc_hint)
10060 {
10061         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10062                                            min_size, actual_len, alloc_hint, trans);
10063 }
10064
/*
 * Permission check: write access to regular files, directories and symlinks
 * is refused with -EROFS on a read-only root and with -EACCES on an inode
 * carrying BTRFS_INODE_READONLY; everything else is decided by
 * generic_permission().
 */
10065 static int btrfs_permission(struct user_namespace *mnt_userns,
10066                             struct inode *inode, int mask)
10067 {
10068         struct btrfs_inode *binode = BTRFS_I(inode);
10069         const umode_t mode = inode->i_mode;
10070         const bool guarded = S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode);
10071
10072         if (!(mask & MAY_WRITE) || !guarded)
10073                 return generic_permission(mnt_userns, inode, mask);
10074         if (btrfs_root_readonly(binode->root))
10075                 return -EROFS;
10076         if (binode->flags & BTRFS_INODE_READONLY)
10077                 return -EACCES;
10078         return generic_permission(mnt_userns, inode, mask);
10079 }
10080
/*
 * Create an unnamed inode for @dentry in the root of @dir: the inode is
 * created without a name, gets an orphan item and is attached to @dentry with
 * d_tmpfile().  Returns 0 on success or a negative errno; on failure the new
 * inode is discarded.
 */
10081 static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
10082                          struct dentry *dentry, umode_t mode)
10083 {
10084         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10085         struct btrfs_trans_handle *trans;
10086         struct btrfs_root *root = BTRFS_I(dir)->root;
10087         struct inode *inode = NULL;
10088         u64 objectid;
10089         u64 index;
10090         int ret = 0;
10091
10092         /*
10093          * 5 units required for adding orphan entry
10094          */
10095         trans = btrfs_start_transaction(root, 5);
10096         if (IS_ERR(trans))
10097                 return PTR_ERR(trans);
10098
10099         ret = btrfs_get_free_objectid(root, &objectid);
10100         if (ret)
10101                 goto out;
10102
              /* No name is passed (NULL, length 0) for the new inode. */
10103         inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
10104                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10105         if (IS_ERR(inode)) {
10106                 ret = PTR_ERR(inode);
                      /* Keep the cleanup at out from touching an error pointer. */
10107                 inode = NULL;
10108                 goto out;
10109         }
10110
10111         inode->i_fop = &btrfs_file_operations;
10112         inode->i_op = &btrfs_file_inode_operations;
10113
10114         inode->i_mapping->a_ops = &btrfs_aops;
10115
10116         ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10117         if (ret)
10118                 goto out;
10119
10120         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10121         if (ret)
10122                 goto out;
              /* Record the inode as an orphan. */
10123         ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10124         if (ret)
10125                 goto out;
10126
10127         /*
10128          * We set number of links to 0 in btrfs_new_inode(), and here we set
10129          * it to 1 because d_tmpfile() will issue a warning if the count is 0,
10130          * through:
10131          *
10132          *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10133          */
10134         set_nlink(inode, 1);
10135         d_tmpfile(dentry, inode);
10136         unlock_new_inode(inode);
10137         mark_inode_dirty(inode);
10138 out:
10139         btrfs_end_transaction(trans);
              /* ret is 0 on the success path, so the inode is kept there. */
10140         if (ret && inode)
10141                 discard_new_inode(inode);
10142         btrfs_btree_balance_dirty(fs_info);
10143         return ret;
10144 }
10145
/*
 * Set the writeback state for the byte range [@start, @end] of @inode on every
 * page cache page the range touches.  The range length must fit in a u32 and
 * all of its pages are expected to be present in the page cache.
 */
10146 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
10147 {
10148         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10149         const unsigned long last = end >> PAGE_SHIFT;
10150         unsigned long cur;
10151         u32 len;
10152
10153         ASSERT(end + 1 - start <= U32_MAX);
10154         len = end + 1 - start;
10155         for (cur = start >> PAGE_SHIFT; cur <= last; cur++) {
10156                 struct page *page;
10157
10158                 page = find_get_page(inode->vfs_inode.i_mapping, cur);
10159                 ASSERT(page); /* Pages should be in the extent_io_tree */
10160
10161                 btrfs_page_set_writeback(fs_info, page, start, len);
10162                 put_page(page);
10163         }
10164 }
10165
/*
 * Translate an extent's BTRFS_COMPRESS_* type into the matching
 * BTRFS_ENCODED_IO_COMPRESSION_* value.  Returns -EINVAL for LZO with a
 * sector size outside 4K..64K and -EUCLEAN for an unknown compression type.
 */
10166 static int btrfs_encoded_io_compression_from_extent(
10167                                 struct btrfs_fs_info *fs_info,
10168                                 int compress_type)
10169 {
10170         if (compress_type == BTRFS_COMPRESS_NONE)
10171                 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
10172         if (compress_type == BTRFS_COMPRESS_ZLIB)
10173                 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
10174         if (compress_type == BTRFS_COMPRESS_ZSTD)
10175                 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
10176         if (compress_type != BTRFS_COMPRESS_LZO)
10177                 return -EUCLEAN;
10178
10179         /*
10180          * The LZO encoding is tied to the sector size; only 4K through 64K
10181          * (the largest sector size that we support) have a value.
10182          */
10183         if (fs_info->sectorsize < SZ_4K)
10184                 return -EINVAL;
10185         if (fs_info->sectorsize > SZ_64K)
10186                 return -EINVAL;
10187         return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
10188                (fs_info->sectorsize_bits - 12);
10189 }
10190
10191 static ssize_t btrfs_encoded_read_inline(
10192                                 struct kiocb *iocb,
10193                                 struct iov_iter *iter, u64 start,
10194                                 u64 lockend,
10195                                 struct extent_state **cached_state,
10196                                 u64 extent_start, size_t count,
10197                                 struct btrfs_ioctl_encoded_io_args *encoded,
10198                                 bool *unlocked)
10199 {
10200         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10201         struct btrfs_root *root = inode->root;
10202         struct btrfs_fs_info *fs_info = root->fs_info;
10203         struct extent_io_tree *io_tree = &inode->io_tree;
10204         struct btrfs_path *path;
10205         struct extent_buffer *leaf;
10206         struct btrfs_file_extent_item *item;
10207         u64 ram_bytes;
10208         unsigned long ptr;
10209         void *tmp;
10210         ssize_t ret;
10211
10212         path = btrfs_alloc_path();
10213         if (!path) {
10214                 ret = -ENOMEM;
10215                 goto out;
10216         }
10217         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
10218                                        extent_start, 0);
10219         if (ret) {
10220                 if (ret > 0) {
10221                         /* The extent item disappeared? */
10222                         ret = -EIO;
10223                 }
10224                 goto out;
10225         }
10226         leaf = path->nodes[0];
10227         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10228
10229         ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
10230         ptr = btrfs_file_extent_inline_start(item);
10231
10232         encoded->len = min_t(u64, extent_start + ram_bytes,
10233                              inode->vfs_inode.i_size) - iocb->ki_pos;
10234         ret = btrfs_encoded_io_compression_from_extent(fs_info,
10235                                  btrfs_file_extent_compression(leaf, item));
10236         if (ret < 0)
10237                 goto out;
10238         encoded->compression = ret;
10239         if (encoded->compression) {
10240                 size_t inline_size;
10241
10242                 inline_size = btrfs_file_extent_inline_item_len(leaf,
10243                                                                 path->slots[0]);
10244                 if (inline_size > count) {
10245                         ret = -ENOBUFS;
10246                         goto out;
10247                 }
10248                 count = inline_size;
10249                 encoded->unencoded_len = ram_bytes;
10250                 encoded->unencoded_offset = iocb->ki_pos - extent_start;
10251         } else {
10252                 count = min_t(u64, count, encoded->len);
10253                 encoded->len = count;
10254                 encoded->unencoded_len = count;
10255                 ptr += iocb->ki_pos - extent_start;
10256         }
10257
10258         tmp = kmalloc(count, GFP_NOFS);
10259         if (!tmp) {
10260                 ret = -ENOMEM;
10261                 goto out;
10262         }
10263         read_extent_buffer(leaf, tmp, ptr, count);
10264         btrfs_release_path(path);
10265         unlock_extent_cached(io_tree, start, lockend, cached_state);
10266         btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10267         *unlocked = true;
10268
10269         ret = copy_to_iter(tmp, count, iter);
10270         if (ret != count)
10271                 ret = -EFAULT;
10272         kfree(tmp);
10273 out:
10274         btrfs_free_path(path);
10275         return ret;
10276 }
10277
10278 struct btrfs_encoded_read_private {
10279         struct btrfs_inode *inode;
10280         u64 file_offset;
10281         wait_queue_head_t wait;
10282         atomic_t pending;
10283         blk_status_t status;
10284         bool skip_csum;
10285 };
10286
10287 static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
10288                                             struct bio *bio, int mirror_num)
10289 {
10290         struct btrfs_encoded_read_private *priv = bio->bi_private;
10291         struct btrfs_bio *bbio = btrfs_bio(bio);
10292         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10293         blk_status_t ret;
10294
10295         if (!priv->skip_csum) {
10296                 ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
10297                 if (ret)
10298                         return ret;
10299         }
10300
10301         ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
10302         if (ret) {
10303                 btrfs_bio_free_csum(bbio);
10304                 return ret;
10305         }
10306
10307         atomic_inc(&priv->pending);
10308         ret = btrfs_map_bio(fs_info, bio, mirror_num);
10309         if (ret) {
10310                 atomic_dec(&priv->pending);
10311                 btrfs_bio_free_csum(bbio);
10312         }
10313         return ret;
10314 }
10315
10316 static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
10317 {
10318         const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
10319         struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
10320         struct btrfs_inode *inode = priv->inode;
10321         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10322         u32 sectorsize = fs_info->sectorsize;
10323         struct bio_vec *bvec;
10324         struct bvec_iter_all iter_all;
10325         u64 start = priv->file_offset;
10326         u32 bio_offset = 0;
10327
10328         if (priv->skip_csum || !uptodate)
10329                 return bbio->bio.bi_status;
10330
10331         bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
10332                 unsigned int i, nr_sectors, pgoff;
10333
10334                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
10335                 pgoff = bvec->bv_offset;
10336                 for (i = 0; i < nr_sectors; i++) {
10337                         ASSERT(pgoff < PAGE_SIZE);
10338                         if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
10339                                             bvec->bv_page, pgoff, start))
10340                                 return BLK_STS_IOERR;
10341                         start += sectorsize;
10342                         bio_offset += sectorsize;
10343                         pgoff += sectorsize;
10344                 }
10345         }
10346         return BLK_STS_OK;
10347 }
10348
10349 static void btrfs_encoded_read_endio(struct bio *bio)
10350 {
10351         struct btrfs_encoded_read_private *priv = bio->bi_private;
10352         struct btrfs_bio *bbio = btrfs_bio(bio);
10353         blk_status_t status;
10354
10355         status = btrfs_encoded_read_verify_csum(bbio);
10356         if (status) {
10357                 /*
10358                  * The memory barrier implied by the atomic_dec_return() here
10359                  * pairs with the memory barrier implied by the
10360                  * atomic_dec_return() or io_wait_event() in
10361                  * btrfs_encoded_read_regular_fill_pages() to ensure that this
10362                  * write is observed before the load of status in
10363                  * btrfs_encoded_read_regular_fill_pages().
10364                  */
10365                 WRITE_ONCE(priv->status, status);
10366         }
10367         if (!atomic_dec_return(&priv->pending))
10368                 wake_up(&priv->wait);
10369         btrfs_bio_free_csum(bbio);
10370         bio_put(bio);
10371 }
10372
10373 static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
10374                                                  u64 file_offset,
10375                                                  u64 disk_bytenr,
10376                                                  u64 disk_io_size,
10377                                                  struct page **pages)
10378 {
10379         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10380         struct btrfs_encoded_read_private priv = {
10381                 .inode = inode,
10382                 .file_offset = file_offset,
10383                 .pending = ATOMIC_INIT(1),
10384                 .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
10385         };
10386         unsigned long i = 0;
10387         u64 cur = 0;
10388         int ret;
10389
10390         init_waitqueue_head(&priv.wait);
10391         /*
10392          * Submit bios for the extent, splitting due to bio or stripe limits as
10393          * necessary.
10394          */
10395         while (cur < disk_io_size) {
10396                 struct extent_map *em;
10397                 struct btrfs_io_geometry geom;
10398                 struct bio *bio = NULL;
10399                 u64 remaining;
10400
10401                 em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
10402                                          disk_io_size - cur);
10403                 if (IS_ERR(em)) {
10404                         ret = PTR_ERR(em);
10405                 } else {
10406                         ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
10407                                                     disk_bytenr + cur, &geom);
10408                         free_extent_map(em);
10409                 }
10410                 if (ret) {
10411                         WRITE_ONCE(priv.status, errno_to_blk_status(ret));
10412                         break;
10413                 }
10414                 remaining = min(geom.len, disk_io_size - cur);
10415                 while (bio || remaining) {
10416                         size_t bytes = min_t(u64, remaining, PAGE_SIZE);
10417
10418                         if (!bio) {
10419                                 bio = btrfs_bio_alloc(BIO_MAX_VECS);
10420                                 bio->bi_iter.bi_sector =
10421                                         (disk_bytenr + cur) >> SECTOR_SHIFT;
10422                                 bio->bi_end_io = btrfs_encoded_read_endio;
10423                                 bio->bi_private = &priv;
10424                                 bio->bi_opf = REQ_OP_READ;
10425                         }
10426
10427                         if (!bytes ||
10428                             bio_add_page(bio, pages[i], bytes, 0) < bytes) {
10429                                 blk_status_t status;
10430
10431                                 status = submit_encoded_read_bio(inode, bio, 0);
10432                                 if (status) {
10433                                         WRITE_ONCE(priv.status, status);
10434                                         bio_put(bio);
10435                                         goto out;
10436                                 }
10437                                 bio = NULL;
10438                                 continue;
10439                         }
10440
10441                         i++;
10442                         cur += bytes;
10443                         remaining -= bytes;
10444                 }
10445         }
10446
10447 out:
10448         if (atomic_dec_return(&priv.pending))
10449                 io_wait_event(priv.wait, !atomic_read(&priv.pending));
10450         /* See btrfs_encoded_read_endio() for ordering. */
10451         return blk_status_to_errno(READ_ONCE(priv.status));
10452 }
10453
10454 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
10455                                           struct iov_iter *iter,
10456                                           u64 start, u64 lockend,
10457                                           struct extent_state **cached_state,
10458                                           u64 disk_bytenr, u64 disk_io_size,
10459                                           size_t count, bool compressed,
10460                                           bool *unlocked)
10461 {
10462         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10463         struct extent_io_tree *io_tree = &inode->io_tree;
10464         struct page **pages;
10465         unsigned long nr_pages, i;
10466         u64 cur;
10467         size_t page_offset;
10468         ssize_t ret;
10469
10470         nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10471         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10472         if (!pages)
10473                 return -ENOMEM;
10474         for (i = 0; i < nr_pages; i++) {
10475                 pages[i] = alloc_page(GFP_NOFS);
10476                 if (!pages[i]) {
10477                         ret = -ENOMEM;
10478                         goto out;
10479                 }
10480         }
10481
10482         ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10483                                                     disk_io_size, pages);
10484         if (ret)
10485                 goto out;
10486
10487         unlock_extent_cached(io_tree, start, lockend, cached_state);
10488         btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10489         *unlocked = true;
10490
10491         if (compressed) {
10492                 i = 0;
10493                 page_offset = 0;
10494         } else {
10495                 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10496                 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10497         }
10498         cur = 0;
10499         while (cur < count) {
10500                 size_t bytes = min_t(size_t, count - cur,
10501                                      PAGE_SIZE - page_offset);
10502
10503                 if (copy_page_to_iter(pages[i], page_offset, bytes,
10504                                       iter) != bytes) {
10505                         ret = -EFAULT;
10506                         goto out;
10507                 }
10508                 i++;
10509                 cur += bytes;
10510                 page_offset = 0;
10511         }
10512         ret = count;
10513 out:
10514         for (i = 0; i < nr_pages; i++) {
10515                 if (pages[i])
10516                         __free_page(pages[i]);
10517         }
10518         kfree(pages);
10519         return ret;
10520 }
10521
10522 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10523                            struct btrfs_ioctl_encoded_io_args *encoded)
10524 {
10525         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10526         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10527         struct extent_io_tree *io_tree = &inode->io_tree;
10528         ssize_t ret;
10529         size_t count = iov_iter_count(iter);
10530         u64 start, lockend, disk_bytenr, disk_io_size;
10531         struct extent_state *cached_state = NULL;
10532         struct extent_map *em;
10533         bool unlocked = false;
10534
10535         file_accessed(iocb->ki_filp);
10536
10537         btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10538
10539         if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10540                 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10541                 return 0;
10542         }
10543         start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10544         /*
10545          * We don't know how long the extent containing iocb->ki_pos is, but if
10546          * it's compressed we know that it won't be longer than this.
10547          */
10548         lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10549
10550         for (;;) {
10551                 struct btrfs_ordered_extent *ordered;
10552
10553                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10554                                                lockend - start + 1);
10555                 if (ret)
10556                         goto out_unlock_inode;
10557                 lock_extent_bits(io_tree, start, lockend, &cached_state);
10558                 ordered = btrfs_lookup_ordered_range(inode, start,
10559                                                      lockend - start + 1);
10560                 if (!ordered)
10561                         break;
10562                 btrfs_put_ordered_extent(ordered);
10563                 unlock_extent_cached(io_tree, start, lockend, &cached_state);
10564                 cond_resched();
10565         }
10566
10567         em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10568         if (IS_ERR(em)) {
10569                 ret = PTR_ERR(em);
10570                 goto out_unlock_extent;
10571         }
10572
10573         if (em->block_start == EXTENT_MAP_INLINE) {
10574                 u64 extent_start = em->start;
10575
10576                 /*
10577                  * For inline extents we get everything we need out of the
10578                  * extent item.
10579                  */
10580                 free_extent_map(em);
10581                 em = NULL;
10582                 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10583                                                 &cached_state, extent_start,
10584                                                 count, encoded, &unlocked);
10585                 goto out;
10586         }
10587
10588         /*
10589          * We only want to return up to EOF even if the extent extends beyond
10590          * that.
10591          */
10592         encoded->len = min_t(u64, extent_map_end(em),
10593                              inode->vfs_inode.i_size) - iocb->ki_pos;
10594         if (em->block_start == EXTENT_MAP_HOLE ||
10595             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10596                 disk_bytenr = EXTENT_MAP_HOLE;
10597                 count = min_t(u64, count, encoded->len);
10598                 encoded->len = count;
10599                 encoded->unencoded_len = count;
10600         } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10601                 disk_bytenr = em->block_start;
10602                 /*
10603                  * Bail if the buffer isn't large enough to return the whole
10604                  * compressed extent.
10605                  */
10606                 if (em->block_len > count) {
10607                         ret = -ENOBUFS;
10608                         goto out_em;
10609                 }
10610                 disk_io_size = count = em->block_len;
10611                 encoded->unencoded_len = em->ram_bytes;
10612                 encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10613                 ret = btrfs_encoded_io_compression_from_extent(fs_info,
10614                                                              em->compress_type);
10615                 if (ret < 0)
10616                         goto out_em;
10617                 encoded->compression = ret;
10618         } else {
10619                 disk_bytenr = em->block_start + (start - em->start);
10620                 if (encoded->len > count)
10621                         encoded->len = count;
10622                 /*
10623                  * Don't read beyond what we locked. This also limits the page
10624                  * allocations that we'll do.
10625                  */
10626                 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10627                 count = start + disk_io_size - iocb->ki_pos;
10628                 encoded->len = count;
10629                 encoded->unencoded_len = count;
10630                 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10631         }
10632         free_extent_map(em);
10633         em = NULL;
10634
10635         if (disk_bytenr == EXTENT_MAP_HOLE) {
10636                 unlock_extent_cached(io_tree, start, lockend, &cached_state);
10637                 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10638                 unlocked = true;
10639                 ret = iov_iter_zero(count, iter);
10640                 if (ret != count)
10641                         ret = -EFAULT;
10642         } else {
10643                 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10644                                                  &cached_state, disk_bytenr,
10645                                                  disk_io_size, count,
10646                                                  encoded->compression,
10647                                                  &unlocked);
10648         }
10649
10650 out:
10651         if (ret >= 0)
10652                 iocb->ki_pos += encoded->len;
10653 out_em:
10654         free_extent_map(em);
10655 out_unlock_extent:
10656         if (!unlocked)
10657                 unlock_extent_cached(io_tree, start, lockend, &cached_state);
10658 out_unlock_inode:
10659         if (!unlocked)
10660                 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
10661         return ret;
10662 }
10663
10664 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10665                                const struct btrfs_ioctl_encoded_io_args *encoded)
10666 {
10667         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10668         struct btrfs_root *root = inode->root;
10669         struct btrfs_fs_info *fs_info = root->fs_info;
10670         struct extent_io_tree *io_tree = &inode->io_tree;
10671         struct extent_changeset *data_reserved = NULL;
10672         struct extent_state *cached_state = NULL;
10673         int compression;
10674         size_t orig_count;
10675         u64 start, end;
10676         u64 num_bytes, ram_bytes, disk_num_bytes;
10677         unsigned long nr_pages, i;
10678         struct page **pages;
10679         struct btrfs_key ins;
10680         bool extent_reserved = false;
10681         struct extent_map *em;
10682         ssize_t ret;
10683
10684         switch (encoded->compression) {
10685         case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10686                 compression = BTRFS_COMPRESS_ZLIB;
10687                 break;
10688         case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10689                 compression = BTRFS_COMPRESS_ZSTD;
10690                 break;
10691         case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10692         case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10693         case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10694         case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10695         case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10696                 /* The sector size must match for LZO. */
10697                 if (encoded->compression -
10698                     BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10699                     fs_info->sectorsize_bits)
10700                         return -EINVAL;
10701                 compression = BTRFS_COMPRESS_LZO;
10702                 break;
10703         default:
10704                 return -EINVAL;
10705         }
10706         if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10707                 return -EINVAL;
10708
10709         orig_count = iov_iter_count(from);
10710
10711         /* The extent size must be sane. */
10712         if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10713             orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10714                 return -EINVAL;
10715
10716         /*
10717          * The compressed data must be smaller than the decompressed data.
10718          *
10719          * It's of course possible for data to compress to larger or the same
10720          * size, but the buffered I/O path falls back to no compression for such
10721          * data, and we don't want to break any assumptions by creating these
10722          * extents.
10723          *
10724          * Note that this is less strict than the current check we have that the
10725          * compressed data must be at least one sector smaller than the
10726          * decompressed data. We only want to enforce the weaker requirement
10727          * from old kernels that it is at least one byte smaller.
10728          */
10729         if (orig_count >= encoded->unencoded_len)
10730                 return -EINVAL;
10731
10732         /* The extent must start on a sector boundary. */
10733         start = iocb->ki_pos;
10734         if (!IS_ALIGNED(start, fs_info->sectorsize))
10735                 return -EINVAL;
10736
10737         /*
10738          * The extent must end on a sector boundary. However, we allow a write
10739          * which ends at or extends i_size to have an unaligned length; we round
10740          * up the extent size and set i_size to the unaligned end.
10741          */
10742         if (start + encoded->len < inode->vfs_inode.i_size &&
10743             !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10744                 return -EINVAL;
10745
10746         /* Finally, the offset in the unencoded data must be sector-aligned. */
10747         if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10748                 return -EINVAL;
10749
10750         num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10751         ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10752         end = start + num_bytes - 1;
10753
10754         /*
10755          * If the extent cannot be inline, the compressed data on disk must be
10756          * sector-aligned. For convenience, we extend it with zeroes if it
10757          * isn't.
10758          */
10759         disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10760         nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10761         pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10762         if (!pages)
10763                 return -ENOMEM;
10764         for (i = 0; i < nr_pages; i++) {
10765                 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10766                 char *kaddr;
10767
10768                 pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10769                 if (!pages[i]) {
10770                         ret = -ENOMEM;
10771                         goto out_pages;
10772                 }
10773                 kaddr = kmap(pages[i]);
10774                 if (copy_from_iter(kaddr, bytes, from) != bytes) {
10775                         kunmap(pages[i]);
10776                         ret = -EFAULT;
10777                         goto out_pages;
10778                 }
10779                 if (bytes < PAGE_SIZE)
10780                         memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10781                 kunmap(pages[i]);
10782         }
10783
10784         for (;;) {
10785                 struct btrfs_ordered_extent *ordered;
10786
10787                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10788                 if (ret)
10789                         goto out_pages;
10790                 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10791                                                     start >> PAGE_SHIFT,
10792                                                     end >> PAGE_SHIFT);
10793                 if (ret)
10794                         goto out_pages;
10795                 lock_extent_bits(io_tree, start, end, &cached_state);
10796                 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10797                 if (!ordered &&
10798                     !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10799                         break;
10800                 if (ordered)
10801                         btrfs_put_ordered_extent(ordered);
10802                 unlock_extent_cached(io_tree, start, end, &cached_state);
10803                 cond_resched();
10804         }
10805
10806         /*
10807          * We don't use the higher-level delalloc space functions because our
10808          * num_bytes and disk_num_bytes are different.
10809          */
10810         ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10811         if (ret)
10812                 goto out_unlock;
10813         ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10814         if (ret)
10815                 goto out_free_data_space;
10816         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
10817         if (ret)
10818                 goto out_qgroup_free_data;
10819
10820         /* Try an inline extent first. */
10821         if (start == 0 && encoded->unencoded_len == encoded->len &&
10822             encoded->unencoded_offset == 0) {
10823                 ret = cow_file_range_inline(inode, encoded->len, orig_count,
10824                                             compression, pages, true);
10825                 if (ret <= 0) {
10826                         if (ret == 0)
10827                                 ret = orig_count;
10828                         goto out_delalloc_release;
10829                 }
10830         }
10831
10832         ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10833                                    disk_num_bytes, 0, 0, &ins, 1, 1);
10834         if (ret)
10835                 goto out_delalloc_release;
10836         extent_reserved = true;
10837
10838         em = create_io_em(inode, start, num_bytes,
10839                           start - encoded->unencoded_offset, ins.objectid,
10840                           ins.offset, ins.offset, ram_bytes, compression,
10841                           BTRFS_ORDERED_COMPRESSED);
10842         if (IS_ERR(em)) {
10843                 ret = PTR_ERR(em);
10844                 goto out_free_reserved;
10845         }
10846         free_extent_map(em);
10847
10848         ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
10849                                        ins.objectid, ins.offset,
10850                                        encoded->unencoded_offset,
10851                                        (1 << BTRFS_ORDERED_ENCODED) |
10852                                        (1 << BTRFS_ORDERED_COMPRESSED),
10853                                        compression);
10854         if (ret) {
10855                 btrfs_drop_extent_cache(inode, start, end, 0);
10856                 goto out_free_reserved;
10857         }
10858         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10859
10860         if (start + encoded->len > inode->vfs_inode.i_size)
10861                 i_size_write(&inode->vfs_inode, start + encoded->len);
10862
10863         unlock_extent_cached(io_tree, start, end, &cached_state);
10864
10865         btrfs_delalloc_release_extents(inode, num_bytes);
10866
10867         if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
10868                                           ins.offset, pages, nr_pages, 0, NULL,
10869                                           false)) {
10870                 btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
10871                 ret = -EIO;
10872                 goto out_pages;
10873         }
10874         ret = orig_count;
10875         goto out;
10876
10877 out_free_reserved:
10878         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10879         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10880 out_delalloc_release:
10881         btrfs_delalloc_release_extents(inode, num_bytes);
10882         btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10883 out_qgroup_free_data:
10884         if (ret < 0)
10885                 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
10886 out_free_data_space:
10887         /*
10888          * If btrfs_reserve_extent() succeeded, then we already decremented
10889          * bytes_may_use.
10890          */
10891         if (!extent_reserved)
10892                 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10893 out_unlock:
10894         unlock_extent_cached(io_tree, start, end, &cached_state);
10895 out_pages:
10896         for (i = 0; i < nr_pages; i++) {
10897                 if (pages[i])
10898                         __free_page(pages[i]);
10899         }
10900         kvfree(pages);
10901 out:
10902         if (ret >= 0)
10903                 iocb->ki_pos += encoded->len;
10904         return ret;
10905 }
10906
10907 #ifdef CONFIG_SWAP
10908 /*
10909  * Add an entry indicating a block group or device which is pinned by a
10910  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10911  * negative errno on failure.
10912  */
10913 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10914                                   bool is_block_group)
10915 {
10916         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10917         struct btrfs_swapfile_pin *sp, *entry;
10918         struct rb_node **p;
10919         struct rb_node *parent = NULL;
10920
10921         sp = kmalloc(sizeof(*sp), GFP_NOFS);
10922         if (!sp)
10923                 return -ENOMEM;
10924         sp->ptr = ptr;
10925         sp->inode = inode;
10926         sp->is_block_group = is_block_group;
10927         sp->bg_extent_count = 1;
10928
10929         spin_lock(&fs_info->swapfile_pins_lock);
10930         p = &fs_info->swapfile_pins.rb_node;
10931         while (*p) {
10932                 parent = *p;
10933                 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10934                 if (sp->ptr < entry->ptr ||
10935                     (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10936                         p = &(*p)->rb_left;
10937                 } else if (sp->ptr > entry->ptr ||
10938                            (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10939                         p = &(*p)->rb_right;
10940                 } else {
10941                         if (is_block_group)
10942                                 entry->bg_extent_count++;
10943                         spin_unlock(&fs_info->swapfile_pins_lock);
10944                         kfree(sp);
10945                         return 1;
10946                 }
10947         }
10948         rb_link_node(&sp->node, parent, p);
10949         rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10950         spin_unlock(&fs_info->swapfile_pins_lock);
10951         return 0;
10952 }
10953
10954 /* Free all of the entries pinned by this swapfile. */
10955 static void btrfs_free_swapfile_pins(struct inode *inode)
10956 {
10957         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10958         struct btrfs_swapfile_pin *sp;
10959         struct rb_node *node, *next;
10960
10961         spin_lock(&fs_info->swapfile_pins_lock);
10962         node = rb_first(&fs_info->swapfile_pins);
10963         while (node) {
10964                 next = rb_next(node);
10965                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10966                 if (sp->inode == inode) {
10967                         rb_erase(&sp->node, &fs_info->swapfile_pins);
10968                         if (sp->is_block_group) {
10969                                 btrfs_dec_block_group_swap_extents(sp->ptr,
10970                                                            sp->bg_extent_count);
10971                                 btrfs_put_block_group(sp->ptr);
10972                         }
10973                         kfree(sp);
10974                 }
10975                 node = next;
10976         }
10977         spin_unlock(&fs_info->swapfile_pins_lock);
10978 }
10979
/*
 * Accumulator used while walking a swapfile's extents during activation.
 * It holds the run of physically contiguous bytes currently being merged
 * plus the running totals reported to the swap code.
 */
struct btrfs_swap_info {
        /* File offset at which the pending run begins. */
        u64 start;
        /* Physical byte address at which the pending run begins. */
        u64 block_start;
        /* Length in bytes of the pending run; 0 means no run is pending. */
        u64 block_len;
        /* Lowest physical page number reported so far. */
        u64 lowest_ppage;
        /* Highest physical page number reported so far. */
        u64 highest_ppage;
        /* Number of swap pages handed to add_swap_extent() so far. */
        unsigned long nr_pages;
        /* Sum of the values returned by add_swap_extent(). */
        int nr_extents;
};
10989
10990 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10991                                  struct btrfs_swap_info *bsi)
10992 {
10993         unsigned long nr_pages;
10994         unsigned long max_pages;
10995         u64 first_ppage, first_ppage_reported, next_ppage;
10996         int ret;
10997
10998         /*
10999          * Our swapfile may have had its size extended after the swap header was
11000          * written. In that case activating the swapfile should not go beyond
11001          * the max size set in the swap header.
11002          */
11003         if (bsi->nr_pages >= sis->max)
11004                 return 0;
11005
11006         max_pages = sis->max - bsi->nr_pages;
11007         first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
11008         next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
11009                                 PAGE_SIZE) >> PAGE_SHIFT;
11010
11011         if (first_ppage >= next_ppage)
11012                 return 0;
11013         nr_pages = next_ppage - first_ppage;
11014         nr_pages = min(nr_pages, max_pages);
11015
11016         first_ppage_reported = first_ppage;
11017         if (bsi->start == 0)
11018                 first_ppage_reported++;
11019         if (bsi->lowest_ppage > first_ppage_reported)
11020                 bsi->lowest_ppage = first_ppage_reported;
11021         if (bsi->highest_ppage < (next_ppage - 1))
11022                 bsi->highest_ppage = next_ppage - 1;
11023
11024         ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
11025         if (ret < 0)
11026                 return ret;
11027         bsi->nr_extents += ret;
11028         bsi->nr_pages += nr_pages;
11029         return 0;
11030 }
11031
11032 static void btrfs_swap_deactivate(struct file *file)
11033 {
11034         struct inode *inode = file_inode(file);
11035
11036         btrfs_free_swapfile_pins(inode);
11037         atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
11038 }
11039
11040 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
11041                                sector_t *span)
11042 {
11043         struct inode *inode = file_inode(file);
11044         struct btrfs_root *root = BTRFS_I(inode)->root;
11045         struct btrfs_fs_info *fs_info = root->fs_info;
11046         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
11047         struct extent_state *cached_state = NULL;
11048         struct extent_map *em = NULL;
11049         struct btrfs_device *device = NULL;
11050         struct btrfs_swap_info bsi = {
11051                 .lowest_ppage = (sector_t)-1ULL,
11052         };
11053         int ret = 0;
11054         u64 isize;
11055         u64 start;
11056
11057         /*
11058          * If the swap file was just created, make sure delalloc is done. If the
11059          * file changes again after this, the user is doing something stupid and
11060          * we don't really care.
11061          */
11062         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
11063         if (ret)
11064                 return ret;
11065
11066         /*
11067          * The inode is locked, so these flags won't change after we check them.
11068          */
11069         if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
11070                 btrfs_warn(fs_info, "swapfile must not be compressed");
11071                 return -EINVAL;
11072         }
11073         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
11074                 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
11075                 return -EINVAL;
11076         }
11077         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
11078                 btrfs_warn(fs_info, "swapfile must not be checksummed");
11079                 return -EINVAL;
11080         }
11081
11082         /*
11083          * Balance or device remove/replace/resize can move stuff around from
11084          * under us. The exclop protection makes sure they aren't running/won't
11085          * run concurrently while we are mapping the swap extents, and
11086          * fs_info->swapfile_pins prevents them from running while the swap
11087          * file is active and moving the extents. Note that this also prevents
11088          * a concurrent device add which isn't actually necessary, but it's not
11089          * really worth the trouble to allow it.
11090          */
11091         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
11092                 btrfs_warn(fs_info,
11093            "cannot activate swapfile while exclusive operation is running");
11094                 return -EBUSY;
11095         }
11096
11097         /*
11098          * Prevent snapshot creation while we are activating the swap file.
11099          * We do not want to race with snapshot creation. If snapshot creation
11100          * already started before we bumped nr_swapfiles from 0 to 1 and
11101          * completes before the first write into the swap file after it is
11102          * activated, than that write would fallback to COW.
11103          */
11104         if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
11105                 btrfs_exclop_finish(fs_info);
11106                 btrfs_warn(fs_info,
11107            "cannot activate swapfile because snapshot creation is in progress");
11108                 return -EINVAL;
11109         }
11110         /*
11111          * Snapshots can create extents which require COW even if NODATACOW is
11112          * set. We use this counter to prevent snapshots. We must increment it
11113          * before walking the extents because we don't want a concurrent
11114          * snapshot to run after we've already checked the extents.
11115          *
11116          * It is possible that subvolume is marked for deletion but still not
11117          * removed yet. To prevent this race, we check the root status before
11118          * activating the swapfile.
11119          */
11120         spin_lock(&root->root_item_lock);
11121         if (btrfs_root_dead(root)) {
11122                 spin_unlock(&root->root_item_lock);
11123
11124                 btrfs_exclop_finish(fs_info);
11125                 btrfs_warn(fs_info,
11126                 "cannot activate swapfile because subvolume %llu is being deleted",
11127                         root->root_key.objectid);
11128                 return -EPERM;
11129         }
11130         atomic_inc(&root->nr_swapfiles);
11131         spin_unlock(&root->root_item_lock);
11132
11133         isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
11134
11135         lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
11136         start = 0;
11137         while (start < isize) {
11138                 u64 logical_block_start, physical_block_start;
11139                 struct btrfs_block_group *bg;
11140                 u64 len = isize - start;
11141
11142                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
11143                 if (IS_ERR(em)) {
11144                         ret = PTR_ERR(em);
11145                         goto out;
11146                 }
11147
11148                 if (em->block_start == EXTENT_MAP_HOLE) {
11149                         btrfs_warn(fs_info, "swapfile must not have holes");
11150                         ret = -EINVAL;
11151                         goto out;
11152                 }
11153                 if (em->block_start == EXTENT_MAP_INLINE) {
11154                         /*
11155                          * It's unlikely we'll ever actually find ourselves
11156                          * here, as a file small enough to fit inline won't be
11157                          * big enough to store more than the swap header, but in
11158                          * case something changes in the future, let's catch it
11159                          * here rather than later.
11160                          */
11161                         btrfs_warn(fs_info, "swapfile must not be inline");
11162                         ret = -EINVAL;
11163                         goto out;
11164                 }
11165                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
11166                         btrfs_warn(fs_info, "swapfile must not be compressed");
11167                         ret = -EINVAL;
11168                         goto out;
11169                 }
11170
11171                 logical_block_start = em->block_start + (start - em->start);
11172                 len = min(len, em->len - (start - em->start));
11173                 free_extent_map(em);
11174                 em = NULL;
11175
11176                 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
11177                 if (ret < 0) {
11178                         goto out;
11179                 } else if (ret) {
11180                         ret = 0;
11181                 } else {
11182                         btrfs_warn(fs_info,
11183                                    "swapfile must not be copy-on-write");
11184                         ret = -EINVAL;
11185                         goto out;
11186                 }
11187
11188                 em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
11189                 if (IS_ERR(em)) {
11190                         ret = PTR_ERR(em);
11191                         goto out;
11192                 }
11193
11194                 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
11195                         btrfs_warn(fs_info,
11196                                    "swapfile must have single data profile");
11197                         ret = -EINVAL;
11198                         goto out;
11199                 }
11200
11201                 if (device == NULL) {
11202                         device = em->map_lookup->stripes[0].dev;
11203                         ret = btrfs_add_swapfile_pin(inode, device, false);
11204                         if (ret == 1)
11205                                 ret = 0;
11206                         else if (ret)
11207                                 goto out;
11208                 } else if (device != em->map_lookup->stripes[0].dev) {
11209                         btrfs_warn(fs_info, "swapfile must be on one device");
11210                         ret = -EINVAL;
11211                         goto out;
11212                 }
11213
11214                 physical_block_start = (em->map_lookup->stripes[0].physical +
11215                                         (logical_block_start - em->start));
11216                 len = min(len, em->len - (logical_block_start - em->start));
11217                 free_extent_map(em);
11218                 em = NULL;
11219
11220                 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
11221                 if (!bg) {
11222                         btrfs_warn(fs_info,
11223                            "could not find block group containing swapfile");
11224                         ret = -EINVAL;
11225                         goto out;
11226                 }
11227
11228                 if (!btrfs_inc_block_group_swap_extents(bg)) {
11229                         btrfs_warn(fs_info,
11230                            "block group for swapfile at %llu is read-only%s",
11231                            bg->start,
11232                            atomic_read(&fs_info->scrubs_running) ?
11233                                        " (scrub running)" : "");
11234                         btrfs_put_block_group(bg);
11235                         ret = -EINVAL;
11236                         goto out;
11237                 }
11238
11239                 ret = btrfs_add_swapfile_pin(inode, bg, true);
11240                 if (ret) {
11241                         btrfs_put_block_group(bg);
11242                         if (ret == 1)
11243                                 ret = 0;
11244                         else
11245                                 goto out;
11246                 }
11247
11248                 if (bsi.block_len &&
11249                     bsi.block_start + bsi.block_len == physical_block_start) {
11250                         bsi.block_len += len;
11251                 } else {
11252                         if (bsi.block_len) {
11253                                 ret = btrfs_add_swap_extent(sis, &bsi);
11254                                 if (ret)
11255                                         goto out;
11256                         }
11257                         bsi.start = start;
11258                         bsi.block_start = physical_block_start;
11259                         bsi.block_len = len;
11260                 }
11261
11262                 start += len;
11263         }
11264
11265         if (bsi.block_len)
11266                 ret = btrfs_add_swap_extent(sis, &bsi);
11267
11268 out:
11269         if (!IS_ERR_OR_NULL(em))
11270                 free_extent_map(em);
11271
11272         unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
11273
11274         if (ret)
11275                 btrfs_swap_deactivate(file);
11276
11277         btrfs_drew_write_unlock(&root->snapshot_lock);
11278
11279         btrfs_exclop_finish(fs_info);
11280
11281         if (ret)
11282                 return ret;
11283
11284         if (device)
11285                 sis->bdev = device->bdev;
11286         *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
11287         sis->max = bsi.nr_pages;
11288         sis->pages = bsi.nr_pages - 1;
11289         sis->highest_bit = bsi.nr_pages - 1;
11290         return bsi.nr_extents;
11291 }
11292 #else
/* Stub used when CONFIG_SWAP is not set: there is nothing to undo. */
static void btrfs_swap_deactivate(struct file *file)
{
}
11296
/* Stub used when CONFIG_SWAP is not set: always fails with -EOPNOTSUPP. */
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                               sector_t *span)
{
        return -EOPNOTSUPP;
}
11302 #endif
11303
11304 /*
11305  * Update the number of bytes used in the VFS' inode. When we replace extents in
11306  * a range (clone, dedupe, fallocate's zero range), we must update the number of
11307  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
11308  * always get a correct value.
11309  */
11310 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
11311                               const u64 add_bytes,
11312                               const u64 del_bytes)
11313 {
11314         if (add_bytes == del_bytes)
11315                 return;
11316
11317         spin_lock(&inode->lock);
11318         if (del_bytes > 0)
11319                 inode_sub_bytes(&inode->vfs_inode, del_bytes);
11320         if (add_bytes > 0)
11321                 inode_add_bytes(&inode->vfs_inode, add_bytes);
11322         spin_unlock(&inode->lock);
11323 }
11324
/*
 * Inode operations table for directories: entry lookup, creation, removal,
 * linking and renaming, plus attribute, xattr listing, ACL, timestamp,
 * tmpfile and fileattr handlers.
 */
static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .create         = btrfs_create,
        .unlink         = btrfs_unlink,
        .link           = btrfs_link,
        .mkdir          = btrfs_mkdir,
        .rmdir          = btrfs_rmdir,
        .rename         = btrfs_rename2,
        .symlink        = btrfs_symlink,
        .setattr        = btrfs_setattr,
        .mknod          = btrfs_mknod,
        .listxattr      = btrfs_listxattr,
        .permission     = btrfs_permission,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
        .tmpfile        = btrfs_tmpfile,
        .fileattr_get   = btrfs_fileattr_get,
        .fileattr_set   = btrfs_fileattr_set,
};
11346
/*
 * File operations table for open directories: seeking, directory iteration,
 * ioctl (with a compat variant when CONFIG_COMPAT is set), release and fsync.
 */
static const struct file_operations btrfs_dir_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate_shared = btrfs_real_readdir,
        .open           = btrfs_opendir,
        .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
#endif
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
};
11359
/*
 * Address space operations table.
 *
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.  Swapfiles go through the
 * .swap_activate and .swap_deactivate hooks set in this table instead.
 */
static const struct address_space_operations btrfs_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .readahead      = btrfs_readahead,
        .direct_IO      = noop_direct_IO,
        .invalidate_folio = btrfs_invalidate_folio,
        .releasepage    = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
        .migratepage    = btrfs_migratepage,
#endif
        .dirty_folio    = filemap_dirty_folio,
        .error_remove_page = generic_error_remove_page,
        .swap_activate  = btrfs_swap_activate,
        .swap_deactivate = btrfs_swap_deactivate,
};
11388
/*
 * Inode operations table for regular files (per its name): attribute, xattr
 * listing, permission, fiemap, ACL, timestamp and fileattr handlers.
 */
static const struct inode_operations btrfs_file_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .listxattr      = btrfs_listxattr,
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
        .fileattr_get   = btrfs_fileattr_get,
        .fileattr_set   = btrfs_fileattr_set,
};
/*
 * Inode operations table for special inodes (per its name): attribute,
 * permission, xattr listing, ACL and timestamp handlers only.
 */
static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .listxattr      = btrfs_listxattr,
        .get_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time    = btrfs_update_time,
};
/*
 * Inode operations table for symbolic links: link target retrieval through
 * page_get_link() plus attribute, permission, xattr listing and timestamp
 * handlers. No ACL handlers are set here.
 */
static const struct inode_operations btrfs_symlink_inode_operations = {
        .get_link       = page_get_link,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .listxattr      = btrfs_listxattr,
        .update_time    = btrfs_update_time,
};
11418
/* Dentry operations table; only the d_delete hook is provided. */
const struct dentry_operations btrfs_dentry_operations = {
        .d_delete       = btrfs_dentry_delete,
};