fs/btrfs/tree-log.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/slab.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/list_sort.h>
  10 #include <linux/iversion.h>
  11 #include "misc.h"
  12 #include "ctree.h"
  13 #include "tree-log.h"
  14 #include "disk-io.h"
  15 #include "locking.h"
  16 #include "print-tree.h"
  17 #include "backref.h"
  18 #include "compression.h"
  19 #include "qgroup.h"
  20 #include "block-group.h"
  21 #include "space-info.h"
  22 #include "zoned.h"
  23
  24 /* magic values for the inode_only field in btrfs_log_inode:
  25  *
  26  * LOG_INODE_ALL means to log everything
  27  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  28  * during log replay
      *
      * NOTE(review): LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are not described
      * above.  Presumably they are the equivalents of the two modes above for
      * logging an inode other than the one being synced - confirm against
      * btrfs_log_inode() and its callers.
  29  */
  30 enum {
  31         LOG_INODE_ALL,
  32         LOG_INODE_EXISTS,
  33         LOG_OTHER_INODE,
  34         LOG_OTHER_INODE_ALL,
  35 };
  36
  37 /*
  38  * directory trouble cases
  39  *
  40  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  41  * log, we must force a full commit before doing an fsync of the directory
  42  * where the unlink was done.
  43  * ---> record transid of last unlink/rename per directory
  44  *
  45  * mkdir foo/some_dir
  46  * normal commit
  47  * rename foo/some_dir foo2/some_dir
  48  * mkdir foo/some_dir
  49  * fsync foo/some_dir/some_file
  50  *
  51  * The fsync above will unlink the original some_dir without recording
  52  * it in its new location (foo2).  After a crash, some_dir will be gone
  53  * unless the fsync of some_file forces a full commit
  54  *
  55  * 2) we must log any new names for any file or dir that is in the fsync
  56  * log. ---> check inode while renaming/linking.
  57  *
  58  * 2a) we must log any new names for any file or dir during rename
  59  * when the directory they are being removed from was logged.
  60  * ---> check inode and old parent dir during rename
  61  *
  62  *  2a is actually the more important variant.  Without the extra
  63  *  logging a crash might unlink the old name without recreating the new one
  64  *
  65  * 3) after a crash, we must go through any directories with a link count
  66  * of zero and redo the rm -rf
  67  *
  68  * mkdir f1/foo
  69  * normal commit
  70  * rm -rf f1/foo
  71  * fsync(f1)
  72  *
  73  * The directory f1/foo was fully removed from the FS, but fsync was never
  74  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
  75  * be replayed.  This must be able to recurse down the entire
  76  * directory tree.  The inode link count fixup code takes care of the
  77  * ugly details.
  78  */
  79
  80 /*
  81  * stages for the tree walking.  The first
  82  * stage (0) is to only pin down the blocks we find
  83  * the second stage (1) is to make sure that all the inodes
  84  * we find in the log are created in the subvolume.
  85  *
  86  * The last stage is to deal with directories and links and extents
  87  * and all the other fun semantics
      *
      * The enumerators take the values 0..3 in declaration order, so
      * LOG_WALK_REPLAY_DIR_INDEX (2) sits between the inode stage (1) and the
      * final LOG_WALK_REPLAY_ALL stage (3).
      *
      * NOTE(review): the description above does not mention the
      * LOG_WALK_REPLAY_DIR_INDEX stage; presumably it replays directory index
      * items before everything else is replayed - confirm at its users.
  88  */
  89 enum {
  90         LOG_WALK_PIN_ONLY,
  91         LOG_WALK_REPLAY_INODES,
  92         LOG_WALK_REPLAY_DIR_INDEX,
  93         LOG_WALK_REPLAY_ALL,
  94 };
  95
     /*
      * Forward declarations of static helpers that are defined further down.
      * wait_log_commit() is called by start_log_trans() and
      * join_running_log_trans() below.
      */
  96 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  97                            struct btrfs_root *root, struct btrfs_inode *inode,
  98                            int inode_only,
  99                            struct btrfs_log_ctx *ctx);
 100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 101                              struct btrfs_root *root,
 102                              struct btrfs_path *path, u64 objectid);
 103 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 104                                        struct btrfs_root *root,
 105                                        struct btrfs_root *log,
 106                                        struct btrfs_path *path,
 107                                        u64 dirid, int del_all);
 108 static void wait_log_commit(struct btrfs_root *root, int transid);
 109
 110 /*
 111  * tree logging is a special write ahead log used to make sure that
 112  * fsyncs and O_SYNCs can happen without doing full tree commits.
 113  *
 114  * Full tree commits are expensive because they require commonly
 115  * modified blocks to be recowed, creating many dirty pages in the
 116  * extent tree and a 4x-6x higher write load than ext3.
 117  *
 118  * Instead of doing a tree commit on every fsync, we use the
 119  * key ranges and transaction ids to find items for a given file or directory
 120  * that have changed in this transaction.  Those items are copied into
 121  * a special tree (one per subvolume root), that tree is written to disk
 122  * and then the fsync is considered complete.
 123  *
 124  * After a crash, items are copied out of the log-tree back into the
 125  * subvolume tree.  Any file data extents found are recorded in the extent
 126  * allocation tree, and the log-tree freed.
 127  *
 128  * The log tree is read three times: once to pin down all the extents it is
 129  * using in ram, once to create all the inodes logged in the tree
 130  * and once to do all the other items.
 131  */
 132
 133 /*
 134  * start a sub transaction and setup the log tree
 135  * this increments the log tree writer count to make the people
 136  * syncing the tree wait for us to finish
      *
      * @trans: handle passed on to btrfs_init_log_root_tree(),
      *         btrfs_add_log_tree() and btrfs_need_log_full_commit()
      * @root:  root whose log tree is created or joined
      * @ctx:   optional log context; when non-NULL and not logging a new name
      *         it is queued on root->log_ctxs[] for the current log transaction
      *
      * Returns 0 with root->log_writers incremented.  Returns -EAGAIN when
      * btrfs_need_log_full_commit() is true, or on a zoned filesystem when the
      * root has no log tree yet and the log root tree was not created by this
      * call.  Otherwise returns the error from btrfs_init_log_root_tree() or
      * btrfs_add_log_tree().  On any non-zero return log_writers is not
      * incremented.
 137  */
 138 static int start_log_trans(struct btrfs_trans_handle *trans,
 139                            struct btrfs_root *root,
 140                            struct btrfs_log_ctx *ctx)
 141 {
 142         struct btrfs_fs_info *fs_info = root->fs_info;
 143         struct btrfs_root *tree_root = fs_info->tree_root;
 144         const bool zoned = btrfs_is_zoned(fs_info);
 145         int ret = 0;
 146         bool created = false;
 147
 148         /*
 149          * First check if the log root tree was already created. If not, create
 150          * it before locking the root's log_mutex, just to keep lockdep happy.
 151          */
 152         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
 153                 mutex_lock(&tree_root->log_mutex);
                     /* Re-check under the mutex, another task may have created it. */
 154                 if (!fs_info->log_root_tree) {
 155                         ret = btrfs_init_log_root_tree(trans, fs_info);
 156                         if (!ret) {
 157                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
 158                                 created = true;
 159                         }
 160                 }
 161                 mutex_unlock(&tree_root->log_mutex);
 162                 if (ret)
 163                         return ret;
 164         }
 165
 166         mutex_lock(&root->log_mutex);
 167
 168 again:
 169         if (root->log_root) {
                     /* Slot of the other (previous) log transaction. */
 170                 int index = (root->log_transid + 1) % 2;
 171
 172                 if (btrfs_need_log_full_commit(trans)) {
 173                         ret = -EAGAIN;
 174                         goto out;
 175                 }
 176
                     /*
                      * Zoned only: while the previous log transaction is still
                      * marked as committing, wait for it and then re-evaluate
                      * everything from the top.
                      */
 177                 if (zoned && atomic_read(&root->log_commit[index])) {
 178                         wait_log_commit(root, root->log_transid - 1);
 179                         goto again;
 180                 }
 181
                     /*
                      * Record the first task to join this log transaction and
                      * flag the root once a task with a different pid joins.
                      */
 182                 if (!root->log_start_pid) {
 183                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 184                         root->log_start_pid = current->pid;
 185                 } else if (root->log_start_pid != current->pid) {
 186                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 187                 }
 188         } else {
 189                 /*
 190                  * This means fs_info->log_root_tree was already created
 191                  * for some other FS trees. Do the full commit not to mix
 192                  * nodes from multiple log transactions to do sequential
 193                  * writing.
 194                  */
 195                 if (zoned && !created) {
 196                         ret = -EAGAIN;
 197                         goto out;
 198                 }
 199
 200                 ret = btrfs_add_log_tree(trans, root);
 201                 if (ret)
 202                         goto out;
 203
 204                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 205                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 206                 root->log_start_pid = current->pid;
 207         }
 208
             /* Only reached on success; the error paths jump straight to out. */
 209         atomic_inc(&root->log_writers);
 210         if (ctx && !ctx->logging_new_name) {
                     /* Slot of the current log transaction. */
 211                 int index = root->log_transid % 2;
 212                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 213                 ctx->log_transid = root->log_transid;
 214         }
 215
 216 out:
 217         mutex_unlock(&root->log_mutex);
 218         return ret;
 219 }
 220
 221 /*
 222  * returns 0 if there was a log transaction running and we were able
 223  * to join, or returns -ENOENT if there were no transactions
 224  * in progress
      *
      * On a 0 return root->log_writers has been incremented.  -ENOENT is
      * returned either when BTRFS_ROOT_HAS_LOG_TREE is not set in root->state
      * (checked without taking log_mutex) or when root->log_root is NULL once
      * log_mutex is held.
 225  */
 226 static int join_running_log_trans(struct btrfs_root *root)
 227 {
 228         const bool zoned = btrfs_is_zoned(root->fs_info);
 229         int ret = -ENOENT;
 230
 231         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 232                 return ret;
 233
 234         mutex_lock(&root->log_mutex);
 235 again:
 236         if (root->log_root) {
                     /* Slot of the other (previous) log transaction. */
 237                 int index = (root->log_transid + 1) % 2;
 238
 239                 ret = 0;
                     /*
                      * Zoned only: wait while the previous log transaction is
                      * marked as committing, then check root->log_root again.
                      */
 240                 if (zoned && atomic_read(&root->log_commit[index])) {
 241                         wait_log_commit(root, root->log_transid - 1);
 242                         goto again;
 243                 }
 244                 atomic_inc(&root->log_writers);
 245         }
 246         mutex_unlock(&root->log_mutex);
 247         return ret;
 248 }
 249
 250 /*
 251  * This either makes the current running log transaction wait
 252  * until you call btrfs_end_log_trans() or it makes any future
 253  * log transactions wait until you call btrfs_end_log_trans()
      *
      * Implemented by incrementing root->log_writers, the counter that
      * btrfs_end_log_trans() decrements.  No lock is taken here.
 254  */
 255 void btrfs_pin_log_trans(struct btrfs_root *root)
 256 {
 257         atomic_inc(&root->log_writers);
 258 }
 259
 260 /*
 261  * Drop one log writer reference on @root.  When the last writer goes
 262  * away, wake up whoever is sleeping on root->log_writer_wait.
 263  */
 264 void btrfs_end_log_trans(struct btrfs_root *root)
 265 {
 266         if (!atomic_dec_and_test(&root->log_writers))
 267                 return;
 268         /* atomic_dec_and_test implies a barrier, hence the _nomb variant */
 269         cond_wake_up_nomb(&root->log_writer_wait);
 270 }
 271
     /*
      * Call filemap_fdatawrite_range() on the byte range covered by @buf,
      * [buf->start, buf->start + buf->len - 1], using the address_space of the
      * buffer's first page.  Returns the result of that call.
      */
 272 static int btrfs_write_tree_block(struct extent_buffer *buf)
 273 {
 274         return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
 275                                         buf->start + buf->len - 1);
 276 }
 277
     /*
      * Call filemap_fdatawait_range() on the byte range covered by @buf,
      * [buf->start, buf->start + buf->len - 1], using the address_space of the
      * buffer's first page.  The result of that call is discarded.
      */
 278 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 279 {
 280         filemap_fdatawait_range(buf->pages[0]->mapping,
 281                                 buf->start, buf->start + buf->len - 1);
 282 }
 283
 284 /*
 285  * the walk control struct is used to pass state down the chain when
 286  * processing the log tree.  The stage field tells us which part
 287  * of the log tree processing we are currently doing.  The others
 288  * are state fields used for that specific part
      *
      * The int fields write, wait and pin are tested for non-zero in
      * process_one_buffer(); free is presumably used as a flag in the same way
      * - confirm at its users.
 289  */
 290 struct walk_control {
 291         /* should we free the extent on disk when done?  This is used
 292          * at transaction commit time while freeing a log tree
 293          */
 294         int free;
 295
 296         /* should we write out the extent buffer?  This is used
 297          * while flushing the log tree to disk during a sync
 298          */
 299         int write;
 300
 301         /* should we wait for the extent buffer io to finish?  Also used
 302          * while flushing the log tree to disk for a sync
 303          */
 304         int wait;
 305
 306         /* pin only walk, we record which extents on disk belong to the
 307          * log trees
 308          */
 309         int pin;
 310
 311         /* what stage of the replay code we're currently in (a LOG_WALK_* value) */
 312         int stage;
 313
 314         /*
 315          * Ignore any items from the inode currently being processed. Needs
 316          * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
 317          * the LOG_WALK_REPLAY_INODES stage.
 318          */
 319         bool ignore_cur_inode;
 320
 321         /* the root we are currently replaying */
 322         struct btrfs_root *replay_dest;
 323
 324         /* the trans handle for the current replay */
 325         struct btrfs_trans_handle *trans;
 326
 327         /* the function that gets used to process blocks we find in the
 328          * tree.  Note the extent_buffer might not be up to date when it is
 329          * passed in, and it must be checked or read if you need the data
 330          * inside it
              *
              * process_one_buffer() is one implementation of this callback.
 331          */
 332         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 333                             struct walk_control *wc, u64 gen, int level);
 334 };
 335
 336 /*
 337  * process_func callback: pin, write and/or wait on one log tree block
 338  */
 339 static int process_one_buffer(struct btrfs_root *log,
 340                               struct extent_buffer *eb,
 341                               struct walk_control *wc, u64 gen, int level)
 342 {
 343         int err = 0;
 344
 345         /* Mixed fs: leaves must be read so logged extents can be pinned. */
 346         if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 347                 err = btrfs_read_buffer(eb, gen, level, NULL);
 348                 if (err)
 349                         return err;
 350         }
 351
 352         if (wc->pin) {
 353                 err = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
 354                                                       eb->len);
 355                 if (err)
 356                         return err;
 357         }
 358
 359         if (!btrfs_buffer_uptodate(eb, gen, 0))
 360                 return 0;
 361
 362         if (wc->pin && btrfs_header_level(eb) == 0)
 363                 err = btrfs_exclude_logged_extents(eb);
 364         if (wc->write)
 365                 btrfs_write_tree_block(eb);
 366         if (wc->wait)
 367                 btrfs_wait_tree_block_writeback(eb);
 368         return err;
 369 }
 370
 371 /*
 372  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 373  * to the src data we are copying out.
 374  *
 375  * root is the tree we are copying into, and path is a scratch
 376  * path for use in this function (it should be released on entry and
 377  * will be released on exit).
 378  *
 379  * If the key is already in the destination tree the existing item is
 380  * overwritten.  If the existing item isn't big enough, it is extended.
 381  * If it is too large, it is truncated.
 382  *
 383  * If the key isn't in the destination yet, a new item is inserted.
      *
      * For inode items the source item in eb may itself be modified before the
      * copy: when no destination item exists, or one of the same size with
      * different contents exists, nbytes is rewritten (to 0, or to the
      * destination's value) and the i_size of a directory is reset to 0.
      *
      * Returns 0 on success, including when the destination already holds
      * identical contents.  Returns -ENOMEM if the comparison buffers cannot be
      * allocated, or the error from btrfs_search_slot() or
      * btrfs_insert_empty_item() (other than -EEXIST / -EOVERFLOW, which are
      * handled here).
 384  */
 385 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 386                                    struct btrfs_root *root,
 387                                    struct btrfs_path *path,
 388                                    struct extent_buffer *eb, int slot,
 389                                    struct btrfs_key *key)
 390 {
 391         int ret;
 392         u32 item_size;
 393         u64 saved_i_size = 0;
 394         int save_old_i_size = 0;
 395         unsigned long src_ptr;
 396         unsigned long dst_ptr;
 397         int overwrite_root = 0;
 398         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 399
             /* Set when the destination is not a log tree. */
 400         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 401                 overwrite_root = 1;
 402
 403         item_size = btrfs_item_size_nr(eb, slot);
 404         src_ptr = btrfs_item_ptr_offset(eb, slot);
 405
 406         /* look for the key in the destination tree */
 407         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 408         if (ret < 0)
 409                 return ret;
 410
 411         if (ret == 0) {
 412                 char *src_copy;
 413                 char *dst_copy;
 414                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 415                                                   path->slots[0]);
                     /*
                      * Size mismatch: skip the content comparison and the inode
                      * item adjustments below and go resize/overwrite directly.
                      */
 416                 if (dst_size != item_size)
 417                         goto insert;
 418
 419                 if (item_size == 0) {
 420                         btrfs_release_path(path);
 421                         return 0;
 422                 }
 423                 dst_copy = kmalloc(item_size, GFP_NOFS);
 424                 src_copy = kmalloc(item_size, GFP_NOFS);
 425                 if (!dst_copy || !src_copy) {
 426                         btrfs_release_path(path);
 427                         kfree(dst_copy);
 428                         kfree(src_copy);
 429                         return -ENOMEM;
 430                 }
 431
 432                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 433
 434                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 435                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 436                                    item_size);
 437                 ret = memcmp(dst_copy, src_copy, item_size);
 438
 439                 kfree(dst_copy);
 440                 kfree(src_copy);
 441                 /*
 442                  * they have the same contents, just return, this saves
 443                  * us from cowing blocks in the destination tree and doing
 444                  * extra writes that may not have been done by a previous
 445                  * sync
 446                  */
 447                 if (ret == 0) {
 448                         btrfs_release_path(path);
 449                         return 0;
 450                 }
 451
 452                 /*
 453                  * We need to load the old nbytes into the inode so when we
 454                  * replay the extents we've logged we get the right nbytes.
 455                  */
 456                 if (inode_item) {
 457                         struct btrfs_inode_item *item;
 458                         u64 nbytes;
 459                         u32 mode;
 460
 461                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 462                                               struct btrfs_inode_item);
 463                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
                             /* From here on item refers to the source item in eb. */
 464                         item = btrfs_item_ptr(eb, slot,
 465                                               struct btrfs_inode_item);
 466                         btrfs_set_inode_nbytes(eb, item, nbytes);
 467
 468                         /*
 469                          * If this is a directory we need to reset the i_size to
 470                          * 0 so that we can set it up properly when replaying
 471                          * the rest of the items in this log.
 472                          */
 473                         mode = btrfs_inode_mode(eb, item);
 474                         if (S_ISDIR(mode))
 475                                 btrfs_set_inode_size(eb, item, 0);
 476                 }
 477         } else if (inode_item) {
 478                 struct btrfs_inode_item *item;
 479                 u32 mode;
 480
 481                 /*
 482                  * New inode, set nbytes to 0 so that the nbytes comes out
 483                  * properly when we replay the extents.
 484                  */
 485                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 486                 btrfs_set_inode_nbytes(eb, item, 0);
 487
 488                 /*
 489                  * If this is a directory we need to reset the i_size to 0 so
 490                  * that we can set it up properly when replaying the rest of
 491                  * the items in this log.
 492                  */
 493                 mode = btrfs_inode_mode(eb, item);
 494                 if (S_ISDIR(mode))
 495                         btrfs_set_inode_size(eb, item, 0);
 496         }
 497 insert:
 498         btrfs_release_path(path);
 499         /* try to insert the key into the destination tree */
             /*
              * NOTE(review): skip_release_on_error presumably keeps path pointing
              * at the existing item when the insert fails with -EEXIST or
              * -EOVERFLOW, since path->nodes[0] is used right below - confirm.
              */
 500         path->skip_release_on_error = 1;
 501         ret = btrfs_insert_empty_item(trans, root, path,
 502                                       key, item_size);
 503         path->skip_release_on_error = 0;
 504
 505         /* make sure any existing item is the correct size */
 506         if (ret == -EEXIST || ret == -EOVERFLOW) {
 507                 u32 found_size;
 508                 found_size = btrfs_item_size_nr(path->nodes[0],
 509                                                 path->slots[0]);
 510                 if (found_size > item_size)
 511                         btrfs_truncate_item(path, item_size, 1);
 512                 else if (found_size < item_size)
 513                         btrfs_extend_item(path, item_size - found_size);
 514         } else if (ret) {
 515                 return ret;
 516         }
 517         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 518                                         path->slots[0]);
 519
 520         /* don't overwrite an existing inode if the generation number
 521          * was logged as zero.  This is done when the tree logging code
 522          * is just logging an inode to make sure it exists after recovery.
 523          *
 524          * Also, don't overwrite i_size on directories during replay.
 525          * log replay inserts and removes directory items based on the
 526          * state of the tree found in the subvolume, and i_size is modified
 527          * as it goes
 528          */
 529         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 530                 struct btrfs_inode_item *src_item;
 531                 struct btrfs_inode_item *dst_item;
 532
                     /*
                      * src_ptr and dst_ptr are item offsets inside the extent
                      * buffers; the casted values are only handed to the
                      * btrfs_inode_*() accessors in this function.
                      */
 533                 src_item = (struct btrfs_inode_item *)src_ptr;
 534                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 535
 536                 if (btrfs_inode_generation(eb, src_item) == 0) {
 537                         struct extent_buffer *dst_eb = path->nodes[0];
 538                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 539
 540                         /*
 541                          * For regular files an ino_size == 0 is used only when
 542                          * logging that an inode exists, as part of a directory
 543                          * fsync, and the inode wasn't fsynced before. In this
 544                          * case don't set the size of the inode in the fs/subvol
 545                          * tree, otherwise we would be throwing valid data away.
 546                          */
 547                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 548                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 549                             ino_size != 0)
 550                                 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
                             /* Keep the existing item; nothing else is copied. */
 551                         goto no_copy;
 552                 }
 553
 554                 if (overwrite_root &&
 555                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 556                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 557                         save_old_i_size = 1;
 558                         saved_i_size = btrfs_inode_size(path->nodes[0],
 559                                                         dst_item);
 560                 }
 561         }
 562
 563         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 564                            src_ptr, item_size);
 565
             /* Put back the directory i_size that the copy above replaced. */
 566         if (save_old_i_size) {
 567                 struct btrfs_inode_item *dst_item;
 568                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 569                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 570         }
 571
 572         /* make sure the generation is filled in */
 573         if (key->type == BTRFS_INODE_ITEM_KEY) {
 574                 struct btrfs_inode_item *dst_item;
 575                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 576                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 577                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 578                                                    trans->transid);
 579                 }
 580         }
 581 no_copy:
 582         btrfs_mark_buffer_dirty(path->nodes[0]);
 583         btrfs_release_path(path);
 584         return 0;
 585 }
 586
 587 /*
 588  * Look up inode @objectid in @root via btrfs_iget().  Returns the inode, or
 589  * NULL if the lookup failed.  Only valid for subvolume roots, never the log.
 590  */
 591 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 592                                              u64 objectid)
 593 {
 594         struct inode *found = btrfs_iget(root->fs_info->sb, objectid, root);
 595
 596         /* any ERR_PTR() is flattened to NULL, the error code is dropped */
 597         if (!IS_ERR(found))
 598                 return found;
 599         return NULL;
 600 }
 601
 602 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 603  * subvolume 'root'.  path is released on entry and should be released
 604  * on exit.
 605  *
 606  * extents in the log tree have not been allocated out of the extent
 607  * tree yet.  So, this completes the allocation, taking a reference
 608  * as required if the extent already exists or creating a new extent
 609  * if it isn't in the extent allocation tree yet.
 610  *
 611  * The extent is inserted into the file, dropping any existing extents
 612  * from the file that overlap the new one.
 613  */
 614 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 615                                       struct btrfs_root *root,
 616                                       struct btrfs_path *path,
 617                                       struct extent_buffer *eb, int slot,
 618                                       struct btrfs_key *key)
 619 {
 620         struct btrfs_drop_extents_args drop_args = { 0 };
 621         struct btrfs_fs_info *fs_info = root->fs_info;
 622         int found_type;
 623         u64 extent_end;
 624         u64 start = key->offset;
 625         u64 nbytes = 0;
 626         struct btrfs_file_extent_item *item;
 627         struct inode *inode = NULL;
 628         unsigned long size;
 629         int ret = 0;
 630
 631         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 632         found_type = btrfs_file_extent_type(eb, item);
 633
 634         if (found_type == BTRFS_FILE_EXTENT_REG ||
 635             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 636                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 637                 extent_end = start + nbytes;
 638
 639                 /*
 640                  * We don't add to the inodes nbytes if we are prealloc or a
 641                  * hole.
 642                  */
 643                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 644                         nbytes = 0;
 645         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 646                 size = btrfs_file_extent_ram_bytes(eb, item);
 647                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 648                 extent_end = ALIGN(start + size,
 649                                    fs_info->sectorsize);
 650         } else {
 651                 ret = 0;
 652                 goto out;
 653         }
 654
 655         inode = read_one_inode(root, key->objectid);
 656         if (!inode) {
 657                 ret = -EIO;
 658                 goto out;
 659         }
 660
 661         /*
 662          * first check to see if we already have this extent in the
 663          * file.  This must be done before the btrfs_drop_extents run
 664          * so we don't try to drop this extent.
 665          */
 666         ret = btrfs_lookup_file_extent(trans, root, path,
 667                         btrfs_ino(BTRFS_I(inode)), start, 0);
 668
 669         if (ret == 0 &&
 670             (found_type == BTRFS_FILE_EXTENT_REG ||
 671              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 672                 struct btrfs_file_extent_item cmp1;
 673                 struct btrfs_file_extent_item cmp2;
 674                 struct btrfs_file_extent_item *existing;
 675                 struct extent_buffer *leaf;
 676
 677                 leaf = path->nodes[0];
 678                 existing = btrfs_item_ptr(leaf, path->slots[0],
 679                                           struct btrfs_file_extent_item);
 680
 681                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 682                                    sizeof(cmp1));
 683                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 684                                    sizeof(cmp2));
 685
 686                 /*
 687                  * we already have a pointer to this exact extent,
 688                  * we don't have to do anything
 689                  */
 690                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 691                         btrfs_release_path(path);
 692                         goto out;
 693                 }
 694         }
 695         btrfs_release_path(path);
 696
 697         /* drop any overlapping extents */
 698         drop_args.start = start;
 699         drop_args.end = extent_end;
 700         drop_args.drop_cache = true;
 701         ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
 702         if (ret)
 703                 goto out;
 704
 705         if (found_type == BTRFS_FILE_EXTENT_REG ||
 706             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 707                 u64 offset;
 708                 unsigned long dest_offset;
 709                 struct btrfs_key ins;
 710
 711                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 712                     btrfs_fs_incompat(fs_info, NO_HOLES))
 713                         goto update_inode;
 714
 715                 ret = btrfs_insert_empty_item(trans, root, path, key,
 716                                               sizeof(*item));
 717                 if (ret)
 718                         goto out;
 719                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 720                                                     path->slots[0]);
 721                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 722                                 (unsigned long)item,  sizeof(*item));
 723
 724                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 725                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 726                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 727                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 728
 729                 /*
 730                  * Manually record dirty extent, as here we did a shallow
 731                  * file extent item copy and skip normal backref update,
 732                  * but modifying extent tree all by ourselves.
 733                  * So need to manually record dirty extent for qgroup,
 734                  * as the owner of the file extent changed from log tree
 735                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 736                  */
 737                 ret = btrfs_qgroup_trace_extent(trans,
 738                                 btrfs_file_extent_disk_bytenr(eb, item),
 739                                 btrfs_file_extent_disk_num_bytes(eb, item),
 740                                 GFP_NOFS);
 741                 if (ret < 0)
 742                         goto out;
 743
 744                 if (ins.objectid > 0) {
 745                         struct btrfs_ref ref = { 0 };
 746                         u64 csum_start;
 747                         u64 csum_end;
 748                         LIST_HEAD(ordered_sums);
 749
 750                         /*
 751                          * is this extent already allocated in the extent
 752                          * allocation tree?  If so, just add a reference
 753                          */
 754                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 755                                                 ins.offset);
 756                         if (ret < 0) {
 757                                 goto out;
 758                         } else if (ret == 0) {
 759                                 btrfs_init_generic_ref(&ref,
 760                                                 BTRFS_ADD_DELAYED_REF,
 761                                                 ins.objectid, ins.offset, 0);
 762                                 btrfs_init_data_ref(&ref,
 763                                                 root->root_key.objectid,
 764                                                 key->objectid, offset);
 765                                 ret = btrfs_inc_extent_ref(trans, &ref);
 766                                 if (ret)
 767                                         goto out;
 768                         } else {
 769                                 /*
 770                                  * insert the extent pointer in the extent
 771                                  * allocation tree
 772                                  */
 773                                 ret = btrfs_alloc_logged_file_extent(trans,
 774                                                 root->root_key.objectid,
 775                                                 key->objectid, offset, &ins);
 776                                 if (ret)
 777                                         goto out;
 778                         }
 779                         btrfs_release_path(path);
 780
 781                         if (btrfs_file_extent_compression(eb, item)) {
 782                                 csum_start = ins.objectid;
 783                                 csum_end = csum_start + ins.offset;
 784                         } else {
 785                                 csum_start = ins.objectid +
 786                                         btrfs_file_extent_offset(eb, item);
 787                                 csum_end = csum_start +
 788                                         btrfs_file_extent_num_bytes(eb, item);
 789                         }
 790
 791                         ret = btrfs_lookup_csums_range(root->log_root,
 792                                                 csum_start, csum_end - 1,
 793                                                 &ordered_sums, 0);
 794                         if (ret)
 795                                 goto out;
 796                         /*
 797                          * Now delete all existing csums in the csum root that
 798                          * cover our range. We do this because we can have an
 799                          * extent that is completely referenced by one file
 800                          * extent item and partially referenced by another
 801                          * file extent item (like after using the clone or
 802                          * extent_same ioctls). In this case if we end up doing
 803                          * the replay of the one that partially references the
 804                          * extent first, and we do not do the csum deletion
 805                          * below, we can get 2 csum items in the csum tree that
 806                          * overlap each other. For example, imagine our log has
 807                          * the two following file extent items:
 808                          *
 809                          * key (257 EXTENT_DATA 409600)
 810                          *     extent data disk byte 12845056 nr 102400
 811                          *     extent data offset 20480 nr 20480 ram 102400
 812                          *
 813                          * key (257 EXTENT_DATA 819200)
 814                          *     extent data disk byte 12845056 nr 102400
 815                          *     extent data offset 0 nr 102400 ram 102400
 816                          *
 817                          * Where the second one fully references the 100K extent
 818                          * that starts at disk byte 12845056, and the log tree
 819                          * has a single csum item that covers the entire range
 820                          * of the extent:
 821                          *
 822                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 823                          *
 824                          * After the first file extent item is replayed, the
 825                          * csum tree gets the following csum item:
 826                          *
 827                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 828                          *
 829                          * Which covers the 20K sub-range starting at offset 20K
 830                          * of our extent. Now when we replay the second file
 831                          * extent item, if we do not delete existing csum items
 832                          * that cover any of its blocks, we end up getting two
 833                          * csum items in our csum tree that overlap each other:
 834                          *
 835                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 836                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 837                          *
 838                          * Which is a problem, because after this anyone trying
 839                          * to look up the checksum of any block of our
 840                          * extent starting at an offset of 40K or higher, will
 841                          * end up looking at the second csum item only, which
 842                          * does not contain the checksum for any block starting
 843                          * at offset 40K or higher of our extent.
 844                          */
 845                         while (!list_empty(&ordered_sums)) {
 846                                 struct btrfs_ordered_sum *sums;
 847                                 sums = list_entry(ordered_sums.next,
 848                                                 struct btrfs_ordered_sum,
 849                                                 list);
 850                                 if (!ret)
 851                                         ret = btrfs_del_csums(trans,
 852                                                               fs_info->csum_root,
 853                                                               sums->bytenr,
 854                                                               sums->len);
 855                                 if (!ret)
 856                                         ret = btrfs_csum_file_blocks(trans,
 857                                                 fs_info->csum_root, sums);
 858                                 list_del(&sums->list);
 859                                 kfree(sums);
 860                         }
 861                         if (ret)
 862                                 goto out;
 863                 } else {
 864                         btrfs_release_path(path);
 865                 }
 866         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 867                 /* inline extents are easy, we just overwrite them */
 868                 ret = overwrite_item(trans, root, path, eb, slot, key);
 869                 if (ret)
 870                         goto out;
 871         }
 872
 873         ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
 874                                                 extent_end - start);
 875         if (ret)
 876                 goto out;
 877
 878 update_inode:
 879         btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
 880         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 881 out:
 882         if (inode)
 883                 iput(inode);
 884         return ret;
 885 }
 886
 887 /*
 888  * when cleaning up conflicts between the directory names in the
 889  * subvolume, directory names in the log and directory names in the
 890  * inode back references, we may have to unlink inodes from directories.
 891  *
 892  * This is a helper function to do the unlink of a specific directory
 893  * item
 894  */
 895 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 896                                       struct btrfs_root *root,
 897                                       struct btrfs_path *path,
 898                                       struct btrfs_inode *dir,
 899                                       struct btrfs_dir_item *di)
 900 {
 901         struct inode *inode;
 902         char *name;
 903         int name_len;
 904         struct extent_buffer *leaf;
 905         struct btrfs_key location;
 906         int ret;
 907
 908         leaf = path->nodes[0];
 909
 910         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 911         name_len = btrfs_dir_name_len(leaf, di);
 912         name = kmalloc(name_len, GFP_NOFS);
 913         if (!name)
 914                 return -ENOMEM;
 915
 916         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 917         btrfs_release_path(path);
 918
 919         inode = read_one_inode(root, location.objectid);
 920         if (!inode) {
 921                 ret = -EIO;
 922                 goto out;
 923         }
 924
 925         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 926         if (ret)
 927                 goto out;
 928
 929         ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
 930                         name_len);
 931         if (ret)
 932                 goto out;
 933         else
 934                 ret = btrfs_run_delayed_items(trans);
 935 out:
 936         kfree(name);
 937         iput(inode);
 938         return ret;
 939 }
 940
 941 /*
 942  * helper function to see if a given name and sequence number found
 943  * in an inode back reference are already in a directory and correctly
 944  * point to this inode
 945  */
 946 static noinline int inode_in_dir(struct btrfs_root *root,
 947                                  struct btrfs_path *path,
 948                                  u64 dirid, u64 objectid, u64 index,
 949                                  const char *name, int name_len)
 950 {
 951         struct btrfs_dir_item *di;
 952         struct btrfs_key location;
 953         int match = 0;
 954
 955         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 956                                          index, name, name_len, 0);
 957         if (di && !IS_ERR(di)) {
 958                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 959                 if (location.objectid != objectid)
 960                         goto out;
 961         } else
 962                 goto out;
 963         btrfs_release_path(path);
 964
 965         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 966         if (di && !IS_ERR(di)) {
 967                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 968                 if (location.objectid != objectid)
 969                         goto out;
 970         } else
 971                 goto out;
 972         match = 1;
 973 out:
 974         btrfs_release_path(path);
 975         return match;
 976 }
 977
 978 /*
 979  * helper function to check a log tree for a named back reference in
 980  * an inode.  This is used to decide if a back reference that is
 981  * found in the subvolume conflicts with what we find in the log.
 982  *
 983  * inode backreferences may have multiple refs in a single item,
 984  * during replay we process one reference at a time, and we don't
 985  * want to delete valid links to a file from the subvolume if that
 986  * link is also in the log.
 987  */
 988 static noinline int backref_in_log(struct btrfs_root *log,
 989                                    struct btrfs_key *key,
 990                                    u64 ref_objectid,
 991                                    const char *name, int namelen)
 992 {
 993         struct btrfs_path *path;
 994         int ret;
 995
 996         path = btrfs_alloc_path();
 997         if (!path)
 998                 return -ENOMEM;
 999
1000         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1001         if (ret < 0) {
1002                 goto out;
1003         } else if (ret == 1) {
1004                 ret = 0;
1005                 goto out;
1006         }
1007
1008         if (key->type == BTRFS_INODE_EXTREF_KEY)
1009                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1010                                                        path->slots[0],
1011                                                        ref_objectid,
1012                                                        name, namelen);
1013         else
1014                 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1015                                                    path->slots[0],
1016                                                    name, namelen);
1017 out:
1018         btrfs_free_path(path);
1019         return ret;
1020 }
1021
1022 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1023                                   struct btrfs_root *root,
1024                                   struct btrfs_path *path,
1025                                   struct btrfs_root *log_root,
1026                                   struct btrfs_inode *dir,
1027                                   struct btrfs_inode *inode,
1028                                   u64 inode_objectid, u64 parent_objectid,
1029                                   u64 ref_index, char *name, int namelen,
1030                                   int *search_done)
1031 {
1032         int ret;
1033         char *victim_name;
1034         int victim_name_len;
1035         struct extent_buffer *leaf;
1036         struct btrfs_dir_item *di;
1037         struct btrfs_key search_key;
1038         struct btrfs_inode_extref *extref;
1039
1040 again:
1041         /* Search old style refs */
1042         search_key.objectid = inode_objectid;
1043         search_key.type = BTRFS_INODE_REF_KEY;
1044         search_key.offset = parent_objectid;
1045         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1046         if (ret == 0) {
1047                 struct btrfs_inode_ref *victim_ref;
1048                 unsigned long ptr;
1049                 unsigned long ptr_end;
1050
1051                 leaf = path->nodes[0];
1052
1053                 /* are we trying to overwrite a back ref for the root directory
1054                  * if so, just jump out, we're done
1055                  */
1056                 if (search_key.objectid == search_key.offset)
1057                         return 1;
1058
1059                 /* check all the names in this back reference to see
1060                  * if they are in the log.  if so, we allow them to stay
1061                  * otherwise they must be unlinked as a conflict
1062                  */
1063                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1064                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1065                 while (ptr < ptr_end) {
1066                         victim_ref = (struct btrfs_inode_ref *)ptr;
1067                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1068                                                                    victim_ref);
1069                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1070                         if (!victim_name)
1071                                 return -ENOMEM;
1072
1073                         read_extent_buffer(leaf, victim_name,
1074                                            (unsigned long)(victim_ref + 1),
1075                                            victim_name_len);
1076
1077                         ret = backref_in_log(log_root, &search_key,
1078                                              parent_objectid, victim_name,
1079                                              victim_name_len);
1080                         if (ret < 0) {
1081                                 kfree(victim_name);
1082                                 return ret;
1083                         } else if (!ret) {
1084                                 inc_nlink(&inode->vfs_inode);
1085                                 btrfs_release_path(path);
1086
1087                                 ret = btrfs_unlink_inode(trans, root, dir, inode,
1088                                                 victim_name, victim_name_len);
1089                                 kfree(victim_name);
1090                                 if (ret)
1091                                         return ret;
1092                                 ret = btrfs_run_delayed_items(trans);
1093                                 if (ret)
1094                                         return ret;
1095                                 *search_done = 1;
1096                                 goto again;
1097                         }
1098                         kfree(victim_name);
1099
1100                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1101                 }
1102
1103                 /*
1104                  * NOTE: we have searched root tree and checked the
1105                  * corresponding ref, it does not need to check again.
1106                  */
1107                 *search_done = 1;
1108         }
1109         btrfs_release_path(path);
1110
1111         /* Same search but for extended refs */
1112         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1113                                            inode_objectid, parent_objectid, 0,
1114                                            0);
1115         if (!IS_ERR_OR_NULL(extref)) {
1116                 u32 item_size;
1117                 u32 cur_offset = 0;
1118                 unsigned long base;
1119                 struct inode *victim_parent;
1120
1121                 leaf = path->nodes[0];
1122
1123                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1124                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1125
1126                 while (cur_offset < item_size) {
1127                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1128
1129                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1130
1131                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1132                                 goto next;
1133
1134                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1135                         if (!victim_name)
1136                                 return -ENOMEM;
1137                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1138                                            victim_name_len);
1139
1140                         search_key.objectid = inode_objectid;
1141                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1142                         search_key.offset = btrfs_extref_hash(parent_objectid,
1143                                                               victim_name,
1144                                                               victim_name_len);
1145                         ret = backref_in_log(log_root, &search_key,
1146                                              parent_objectid, victim_name,
1147                                              victim_name_len);
1148                         if (ret < 0) {
1149                                 return ret;
1150                         } else if (!ret) {
1151                                 ret = -ENOENT;
1152                                 victim_parent = read_one_inode(root,
1153                                                 parent_objectid);
1154                                 if (victim_parent) {
1155                                         inc_nlink(&inode->vfs_inode);
1156                                         btrfs_release_path(path);
1157
1158                                         ret = btrfs_unlink_inode(trans, root,
1159                                                         BTRFS_I(victim_parent),
1160                                                         inode,
1161                                                         victim_name,
1162                                                         victim_name_len);
1163                                         if (!ret)
1164                                                 ret = btrfs_run_delayed_items(
1165                                                                   trans);
1166                                 }
1167                                 iput(victim_parent);
1168                                 kfree(victim_name);
1169                                 if (ret)
1170                                         return ret;
1171                                 *search_done = 1;
1172                                 goto again;
1173                         }
1174                         kfree(victim_name);
1175 next:
1176                         cur_offset += victim_name_len + sizeof(*extref);
1177                 }
1178                 *search_done = 1;
1179         }
1180         btrfs_release_path(path);
1181
1182         /* look for a conflicting sequence number */
1183         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1184                                          ref_index, name, namelen, 0);
1185         if (di && !IS_ERR(di)) {
1186                 ret = drop_one_dir_item(trans, root, path, dir, di);
1187                 if (ret)
1188                         return ret;
1189         }
1190         btrfs_release_path(path);
1191
1192         /* look for a conflicting name */
1193         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1194                                    name, namelen, 0);
1195         if (di && !IS_ERR(di)) {
1196                 ret = drop_one_dir_item(trans, root, path, dir, di);
1197                 if (ret)
1198                         return ret;
1199         }
1200         btrfs_release_path(path);
1201
1202         return 0;
1203 }
1204
1205 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1206                              u32 *namelen, char **name, u64 *index,
1207                              u64 *parent_objectid)
1208 {
1209         struct btrfs_inode_extref *extref;
1210
1211         extref = (struct btrfs_inode_extref *)ref_ptr;
1212
1213         *namelen = btrfs_inode_extref_name_len(eb, extref);
1214         *name = kmalloc(*namelen, GFP_NOFS);
1215         if (*name == NULL)
1216                 return -ENOMEM;
1217
1218         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1219                            *namelen);
1220
1221         if (index)
1222                 *index = btrfs_inode_extref_index(eb, extref);
1223         if (parent_objectid)
1224                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1225
1226         return 0;
1227 }
1228
1229 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1230                           u32 *namelen, char **name, u64 *index)
1231 {
1232         struct btrfs_inode_ref *ref;
1233
1234         ref = (struct btrfs_inode_ref *)ref_ptr;
1235
1236         *namelen = btrfs_inode_ref_name_len(eb, ref);
1237         *name = kmalloc(*namelen, GFP_NOFS);
1238         if (*name == NULL)
1239                 return -ENOMEM;
1240
1241         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1242
1243         if (index)
1244                 *index = btrfs_inode_ref_index(eb, ref);
1245
1246         return 0;
1247 }
1248
1249 /*
1250  * Take an inode reference item from the log tree and iterate all names from the
1251  * inode reference item in the subvolume tree with the same key (if it exists).
1252  * For any name that is not in the inode reference item from the log tree, do a
1253  * proper unlink of that name (that is, remove its entry from the inode
1254  * reference item and both dir index keys).
1255  */
1256 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1257                                  struct btrfs_root *root,
1258                                  struct btrfs_path *path,
1259                                  struct btrfs_inode *inode,
1260                                  struct extent_buffer *log_eb,
1261                                  int log_slot,
1262                                  struct btrfs_key *key)
1263 {
1264         int ret;
1265         unsigned long ref_ptr;
1266         unsigned long ref_end;
1267         struct extent_buffer *eb;
1268
1269 again:
1270         btrfs_release_path(path);
1271         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1272         if (ret > 0) {
1273                 ret = 0;
1274                 goto out;
1275         }
1276         if (ret < 0)
1277                 goto out;
1278
1279         eb = path->nodes[0];
1280         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1281         ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1282         while (ref_ptr < ref_end) {
1283                 char *name = NULL;
1284                 int namelen;
1285                 u64 parent_id;
1286
1287                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1288                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1289                                                 NULL, &parent_id);
1290                 } else {
1291                         parent_id = key->offset;
1292                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1293                                              NULL);
1294                 }
1295                 if (ret)
1296                         goto out;
1297
1298                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1299                         ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1300                                                                parent_id, name,
1301                                                                namelen);
1302                 else
1303                         ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1304                                                            name, namelen);
1305
1306                 if (!ret) {
1307                         struct inode *dir;
1308
1309                         btrfs_release_path(path);
1310                         dir = read_one_inode(root, parent_id);
1311                         if (!dir) {
1312                                 ret = -ENOENT;
1313                                 kfree(name);
1314                                 goto out;
1315                         }
1316                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1317                                                  inode, name, namelen);
1318                         kfree(name);
1319                         iput(dir);
1320                         if (ret)
1321                                 goto out;
1322                         goto again;
1323                 }
1324
1325                 kfree(name);
1326                 ref_ptr += namelen;
1327                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1328                         ref_ptr += sizeof(struct btrfs_inode_extref);
1329                 else
1330                         ref_ptr += sizeof(struct btrfs_inode_ref);
1331         }
1332         ret = 0;
1333  out:
1334         btrfs_release_path(path);
1335         return ret;
1336 }
1337
1338 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1339                                   const u8 ref_type, const char *name,
1340                                   const int namelen)
1341 {
1342         struct btrfs_key key;
1343         struct btrfs_path *path;
1344         const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1345         int ret;
1346
1347         path = btrfs_alloc_path();
1348         if (!path)
1349                 return -ENOMEM;
1350
1351         key.objectid = btrfs_ino(BTRFS_I(inode));
1352         key.type = ref_type;
1353         if (key.type == BTRFS_INODE_REF_KEY)
1354                 key.offset = parent_id;
1355         else
1356                 key.offset = btrfs_extref_hash(parent_id, name, namelen);
1357
1358         ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1359         if (ret < 0)
1360                 goto out;
1361         if (ret > 0) {
1362                 ret = 0;
1363                 goto out;
1364         }
1365         if (key.type == BTRFS_INODE_EXTREF_KEY)
1366                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1367                                 path->slots[0], parent_id, name, namelen);
1368         else
1369                 ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1370                                                    name, namelen);
1371
1372 out:
1373         btrfs_free_path(path);
1374         return ret;
1375 }
1376
1377 static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1378                     struct inode *dir, struct inode *inode, const char *name,
1379                     int namelen, u64 ref_index)
1380 {
1381         struct btrfs_dir_item *dir_item;
1382         struct btrfs_key key;
1383         struct btrfs_path *path;
1384         struct inode *other_inode = NULL;
1385         int ret;
1386
1387         path = btrfs_alloc_path();
1388         if (!path)
1389                 return -ENOMEM;
1390
1391         dir_item = btrfs_lookup_dir_item(NULL, root, path,
1392                                          btrfs_ino(BTRFS_I(dir)),
1393                                          name, namelen, 0);
1394         if (!dir_item) {
1395                 btrfs_release_path(path);
1396                 goto add_link;
1397         } else if (IS_ERR(dir_item)) {
1398                 ret = PTR_ERR(dir_item);
1399                 goto out;
1400         }
1401
1402         /*
1403          * Our inode's dentry collides with the dentry of another inode which is
1404          * in the log but not yet processed since it has a higher inode number.
1405          * So delete that other dentry.
1406          */
1407         btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1408         btrfs_release_path(path);
1409         other_inode = read_one_inode(root, key.objectid);
1410         if (!other_inode) {
1411                 ret = -ENOENT;
1412                 goto out;
1413         }
1414         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1415                                  name, namelen);
1416         if (ret)
1417                 goto out;
1418         /*
1419          * If we dropped the link count to 0, bump it so that later the iput()
1420          * on the inode will not free it. We will fixup the link count later.
1421          */
1422         if (other_inode->i_nlink == 0)
1423                 inc_nlink(other_inode);
1424
1425         ret = btrfs_run_delayed_items(trans);
1426         if (ret)
1427                 goto out;
1428 add_link:
1429         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1430                              name, namelen, 0, ref_index);
1431 out:
1432         iput(other_inode);
1433         btrfs_free_path(path);
1434
1435         return ret;
1436 }
1437
1438 /*
1439  * replay one inode back reference item found in the log tree.
1440  * eb, slot and key refer to the buffer and key found in the log tree.
1441  * root is the destination we are replaying into, and path is for temp
1442  * use by this function.  (it should be released on return).
1443  */
1444 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1445                                   struct btrfs_root *root,
1446                                   struct btrfs_root *log,
1447                                   struct btrfs_path *path,
1448                                   struct extent_buffer *eb, int slot,
1449                                   struct btrfs_key *key)
1450 {
1451         struct inode *dir = NULL;
1452         struct inode *inode = NULL;
1453         unsigned long ref_ptr;
1454         unsigned long ref_end;
1455         char *name = NULL;
1456         int namelen;
1457         int ret;
1458         int search_done = 0;
1459         int log_ref_ver = 0;
1460         u64 parent_objectid;
1461         u64 inode_objectid;
1462         u64 ref_index = 0;
1463         int ref_struct_size;
1464
1465         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1466         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1467
1468         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1469                 struct btrfs_inode_extref *r;
1470
1471                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1472                 log_ref_ver = 1;
1473                 r = (struct btrfs_inode_extref *)ref_ptr;
1474                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1475         } else {
1476                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1477                 parent_objectid = key->offset;
1478         }
1479         inode_objectid = key->objectid;
1480
1481         /*
1482          * it is possible that we didn't log all the parent directories
1483          * for a given inode.  If we don't find the dir, just don't
1484          * copy the back ref in.  The link count fixup code will take
1485          * care of the rest
1486          */
1487         dir = read_one_inode(root, parent_objectid);
1488         if (!dir) {
1489                 ret = -ENOENT;
1490                 goto out;
1491         }
1492
1493         inode = read_one_inode(root, inode_objectid);
1494         if (!inode) {
1495                 ret = -EIO;
1496                 goto out;
1497         }
1498
1499         while (ref_ptr < ref_end) {
1500                 if (log_ref_ver) {
1501                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1502                                                 &ref_index, &parent_objectid);
1503                         /*
1504                          * parent object can change from one array
1505                          * item to another.
1506                          */
1507                         if (!dir)
1508                                 dir = read_one_inode(root, parent_objectid);
1509                         if (!dir) {
1510                                 ret = -ENOENT;
1511                                 goto out;
1512                         }
1513                 } else {
1514                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1515                                              &ref_index);
1516                 }
1517                 if (ret)
1518                         goto out;
1519
1520                 /* if we already have a perfect match, we're done */
1521                 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1522                                         btrfs_ino(BTRFS_I(inode)), ref_index,
1523                                         name, namelen)) {
1524                         /*
1525                          * look for a conflicting back reference in the
1526                          * metadata. if we find one we have to unlink that name
1527                          * of the file before we add our new link.  Later on, we
1528                          * overwrite any existing back reference, and we don't
1529                          * want to create dangling pointers in the directory.
1530                          */
1531
1532                         if (!search_done) {
1533                                 ret = __add_inode_ref(trans, root, path, log,
1534                                                       BTRFS_I(dir),
1535                                                       BTRFS_I(inode),
1536                                                       inode_objectid,
1537                                                       parent_objectid,
1538                                                       ref_index, name, namelen,
1539                                                       &search_done);
1540                                 if (ret) {
1541                                         if (ret == 1)
1542                                                 ret = 0;
1543                                         goto out;
1544                                 }
1545                         }
1546
1547                         /*
1548                          * If a reference item already exists for this inode
1549                          * with the same parent and name, but different index,
1550                          * drop it and the corresponding directory index entries
1551                          * from the parent before adding the new reference item
1552                          * and dir index entries, otherwise we would fail with
1553                          * -EEXIST returned from btrfs_add_link() below.
1554                          */
1555                         ret = btrfs_inode_ref_exists(inode, dir, key->type,
1556                                                      name, namelen);
1557                         if (ret > 0) {
1558                                 ret = btrfs_unlink_inode(trans, root,
1559                                                          BTRFS_I(dir),
1560                                                          BTRFS_I(inode),
1561                                                          name, namelen);
1562                                 /*
1563                                  * If we dropped the link count to 0, bump it so
1564                                  * that later the iput() on the inode will not
1565                                  * free it. We will fixup the link count later.
1566                                  */
1567                                 if (!ret && inode->i_nlink == 0)
1568                                         inc_nlink(inode);
1569                         }
1570                         if (ret < 0)
1571                                 goto out;
1572
1573                         /* insert our name */
1574                         ret = add_link(trans, root, dir, inode, name, namelen,
1575                                        ref_index);
1576                         if (ret)
1577                                 goto out;
1578
1579                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1580                         if (ret)
1581                                 goto out;
1582                 }
1583
1584                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1585                 kfree(name);
1586                 name = NULL;
1587                 if (log_ref_ver) {
1588                         iput(dir);
1589                         dir = NULL;
1590                 }
1591         }
1592
1593         /*
1594          * Before we overwrite the inode reference item in the subvolume tree
1595          * with the item from the log tree, we must unlink all names from the
1596          * parent directory that are in the subvolume's tree inode reference
1597          * item, otherwise we end up with an inconsistent subvolume tree where
1598          * dir index entries exist for a name but there is no inode reference
1599          * item with the same name.
1600          */
1601         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1602                                     key);
1603         if (ret)
1604                 goto out;
1605
1606         /* finally write the back reference in the inode */
1607         ret = overwrite_item(trans, root, path, eb, slot, key);
1608 out:
1609         btrfs_release_path(path);
1610         kfree(name);
1611         iput(dir);
1612         iput(inode);
1613         return ret;
1614 }
1615
1616 static int count_inode_extrefs(struct btrfs_root *root,
1617                 struct btrfs_inode *inode, struct btrfs_path *path)
1618 {
1619         int ret = 0;
1620         int name_len;
1621         unsigned int nlink = 0;
1622         u32 item_size;
1623         u32 cur_offset = 0;
1624         u64 inode_objectid = btrfs_ino(inode);
1625         u64 offset = 0;
1626         unsigned long ptr;
1627         struct btrfs_inode_extref *extref;
1628         struct extent_buffer *leaf;
1629
1630         while (1) {
1631                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1632                                             &extref, &offset);
1633                 if (ret)
1634                         break;
1635
1636                 leaf = path->nodes[0];
1637                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1638                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1639                 cur_offset = 0;
1640
1641                 while (cur_offset < item_size) {
1642                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1643                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1644
1645                         nlink++;
1646
1647                         cur_offset += name_len + sizeof(*extref);
1648                 }
1649
1650                 offset++;
1651                 btrfs_release_path(path);
1652         }
1653         btrfs_release_path(path);
1654
1655         if (ret < 0 && ret != -ENOENT)
1656                 return ret;
1657         return nlink;
1658 }
1659
1660 static int count_inode_refs(struct btrfs_root *root,
1661                         struct btrfs_inode *inode, struct btrfs_path *path)
1662 {
1663         int ret;
1664         struct btrfs_key key;
1665         unsigned int nlink = 0;
1666         unsigned long ptr;
1667         unsigned long ptr_end;
1668         int name_len;
1669         u64 ino = btrfs_ino(inode);
1670
1671         key.objectid = ino;
1672         key.type = BTRFS_INODE_REF_KEY;
1673         key.offset = (u64)-1;
1674
1675         while (1) {
1676                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1677                 if (ret < 0)
1678                         break;
1679                 if (ret > 0) {
1680                         if (path->slots[0] == 0)
1681                                 break;
1682                         path->slots[0]--;
1683                 }
1684 process_slot:
1685                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1686                                       path->slots[0]);
1687                 if (key.objectid != ino ||
1688                     key.type != BTRFS_INODE_REF_KEY)
1689                         break;
1690                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1691                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1692                                                    path->slots[0]);
1693                 while (ptr < ptr_end) {
1694                         struct btrfs_inode_ref *ref;
1695
1696                         ref = (struct btrfs_inode_ref *)ptr;
1697                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1698                                                             ref);
1699                         ptr = (unsigned long)(ref + 1) + name_len;
1700                         nlink++;
1701                 }
1702
1703                 if (key.offset == 0)
1704                         break;
1705                 if (path->slots[0] > 0) {
1706                         path->slots[0]--;
1707                         goto process_slot;
1708                 }
1709                 key.offset--;
1710                 btrfs_release_path(path);
1711         }
1712         btrfs_release_path(path);
1713
1714         return nlink;
1715 }
1716
1717 /*
1718  * There are a few corners where the link count of the file can't
1719  * be properly maintained during replay.  So, instead of adding
1720  * lots of complexity to the log code, we just scan the backrefs
1721  * for any file that has been through replay.
1722  *
1723  * The scan will update the link count on the inode to reflect the
1724  * number of back refs found.  If it goes down to zero, the iput
1725  * will free the inode.
1726  */
1727 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1728                                            struct btrfs_root *root,
1729                                            struct inode *inode)
1730 {
1731         struct btrfs_path *path;
1732         int ret;
1733         u64 nlink = 0;
1734         u64 ino = btrfs_ino(BTRFS_I(inode));
1735
1736         path = btrfs_alloc_path();
1737         if (!path)
1738                 return -ENOMEM;
1739
1740         ret = count_inode_refs(root, BTRFS_I(inode), path);
1741         if (ret < 0)
1742                 goto out;
1743
1744         nlink = ret;
1745
1746         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1747         if (ret < 0)
1748                 goto out;
1749
1750         nlink += ret;
1751
1752         ret = 0;
1753
1754         if (nlink != inode->i_nlink) {
1755                 set_nlink(inode, nlink);
1756                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1757                 if (ret)
1758                         goto out;
1759         }
1760         BTRFS_I(inode)->index_cnt = (u64)-1;
1761
1762         if (inode->i_nlink == 0) {
1763                 if (S_ISDIR(inode->i_mode)) {
1764                         ret = replay_dir_deletes(trans, root, NULL, path,
1765                                                  ino, 1);
1766                         if (ret)
1767                                 goto out;
1768                 }
1769                 ret = btrfs_insert_orphan_item(trans, root, ino);
1770                 if (ret == -EEXIST)
1771                         ret = 0;
1772         }
1773
1774 out:
1775         btrfs_free_path(path);
1776         return ret;
1777 }
1778
1779 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1780                                             struct btrfs_root *root,
1781                                             struct btrfs_path *path)
1782 {
1783         int ret;
1784         struct btrfs_key key;
1785         struct inode *inode;
1786
1787         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1788         key.type = BTRFS_ORPHAN_ITEM_KEY;
1789         key.offset = (u64)-1;
1790         while (1) {
1791                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1792                 if (ret < 0)
1793                         break;
1794
1795                 if (ret == 1) {
1796                         ret = 0;
1797                         if (path->slots[0] == 0)
1798                                 break;
1799                         path->slots[0]--;
1800                 }
1801
1802                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1803                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1804                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1805                         break;
1806
1807                 ret = btrfs_del_item(trans, root, path);
1808                 if (ret)
1809                         break;
1810
1811                 btrfs_release_path(path);
1812                 inode = read_one_inode(root, key.offset);
1813                 if (!inode) {
1814                         ret = -EIO;
1815                         break;
1816                 }
1817
1818                 ret = fixup_inode_link_count(trans, root, inode);
1819                 iput(inode);
1820                 if (ret)
1821                         break;
1822
1823                 /*
1824                  * fixup on a directory may create new entries,
1825                  * make sure we always look for the highset possible
1826                  * offset
1827                  */
1828                 key.offset = (u64)-1;
1829         }
1830         btrfs_release_path(path);
1831         return ret;
1832 }
1833
1834
1835 /*
1836  * record a given inode in the fixup dir so we can check its link
1837  * count when replay is done.  The link count is incremented here
1838  * so the inode won't go away until we check it
1839  */
1840 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1841                                       struct btrfs_root *root,
1842                                       struct btrfs_path *path,
1843                                       u64 objectid)
1844 {
1845         struct btrfs_key key;
1846         int ret = 0;
1847         struct inode *inode;
1848
1849         inode = read_one_inode(root, objectid);
1850         if (!inode)
1851                 return -EIO;
1852
1853         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1854         key.type = BTRFS_ORPHAN_ITEM_KEY;
1855         key.offset = objectid;
1856
1857         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1858
1859         btrfs_release_path(path);
1860         if (ret == 0) {
1861                 if (!inode->i_nlink)
1862                         set_nlink(inode, 1);
1863                 else
1864                         inc_nlink(inode);
1865                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1866         } else if (ret == -EEXIST) {
1867                 ret = 0;
1868         }
1869         iput(inode);
1870
1871         return ret;
1872 }
1873
1874 /*
1875  * when replaying the log for a directory, we only insert names
1876  * for inodes that actually exist.  This means an fsync on a directory
1877  * does not implicitly fsync all the new files in it
1878  */
1879 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1880                                     struct btrfs_root *root,
1881                                     u64 dirid, u64 index,
1882                                     char *name, int name_len,
1883                                     struct btrfs_key *location)
1884 {
1885         struct inode *inode;
1886         struct inode *dir;
1887         int ret;
1888
1889         inode = read_one_inode(root, location->objectid);
1890         if (!inode)
1891                 return -ENOENT;
1892
1893         dir = read_one_inode(root, dirid);
1894         if (!dir) {
1895                 iput(inode);
1896                 return -EIO;
1897         }
1898
1899         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1900                         name_len, 1, index);
1901
1902         /* FIXME, put inode into FIXUP list */
1903
1904         iput(inode);
1905         iput(dir);
1906         return ret;
1907 }
1908
1909 /*
1910  * take a single entry in a log directory item and replay it into
1911  * the subvolume.
1912  *
1913  * if a conflicting item exists in the subdirectory already,
1914  * the inode it points to is unlinked and put into the link count
1915  * fix up tree.
1916  *
1917  * If a name from the log points to a file or directory that does
1918  * not exist in the FS, it is skipped.  fsyncs on directories
1919  * do not force down inodes inside that directory, just changes to the
1920  * names or unlinks in a directory.
1921  *
1922  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1923  * non-existing inode) and 1 if the name was replayed.
1924  */
1925 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1926                                     struct btrfs_root *root,
1927                                     struct btrfs_path *path,
1928                                     struct extent_buffer *eb,
1929                                     struct btrfs_dir_item *di,
1930                                     struct btrfs_key *key)
1931 {
1932         char *name;
1933         int name_len;
1934         struct btrfs_dir_item *dst_di;
1935         struct btrfs_key found_key;
1936         struct btrfs_key log_key;
1937         struct inode *dir;
1938         u8 log_type;
1939         int exists;
1940         int ret = 0;
1941         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1942         bool name_added = false;
1943
1944         dir = read_one_inode(root, key->objectid);
1945         if (!dir)
1946                 return -EIO;
1947
1948         name_len = btrfs_dir_name_len(eb, di);
1949         name = kmalloc(name_len, GFP_NOFS);
1950         if (!name) {
1951                 ret = -ENOMEM;
1952                 goto out;
1953         }
1954
1955         log_type = btrfs_dir_type(eb, di);
1956         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1957                    name_len);
1958
1959         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1960         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1961         if (exists == 0)
1962                 exists = 1;
1963         else
1964                 exists = 0;
1965         btrfs_release_path(path);
1966
1967         if (key->type == BTRFS_DIR_ITEM_KEY) {
1968                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1969                                        name, name_len, 1);
1970         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1971                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1972                                                      key->objectid,
1973                                                      key->offset, name,
1974                                                      name_len, 1);
1975         } else {
1976                 /* Corruption */
1977                 ret = -EINVAL;
1978                 goto out;
1979         }
1980         if (IS_ERR_OR_NULL(dst_di)) {
1981                 /* we need a sequence number to insert, so we only
1982                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1983                  */
1984                 if (key->type != BTRFS_DIR_INDEX_KEY)
1985                         goto out;
1986                 goto insert;
1987         }
1988
1989         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1990         /* the existing item matches the logged item */
1991         if (found_key.objectid == log_key.objectid &&
1992             found_key.type == log_key.type &&
1993             found_key.offset == log_key.offset &&
1994             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1995                 update_size = false;
1996                 goto out;
1997         }
1998
1999         /*
2000          * don't drop the conflicting directory entry if the inode
2001          * for the new entry doesn't exist
2002          */
2003         if (!exists)
2004                 goto out;
2005
2006         ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
2007         if (ret)
2008                 goto out;
2009
2010         if (key->type == BTRFS_DIR_INDEX_KEY)
2011                 goto insert;
2012 out:
2013         btrfs_release_path(path);
2014         if (!ret && update_size) {
2015                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2016                 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2017         }
2018         kfree(name);
2019         iput(dir);
2020         if (!ret && name_added)
2021                 ret = 1;
2022         return ret;
2023
2024 insert:
2025         /*
2026          * Check if the inode reference exists in the log for the given name,
2027          * inode and parent inode
2028          */
2029         found_key.objectid = log_key.objectid;
2030         found_key.type = BTRFS_INODE_REF_KEY;
2031         found_key.offset = key->objectid;
2032         ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2033         if (ret < 0) {
2034                 goto out;
2035         } else if (ret) {
2036                 /* The dentry will be added later. */
2037                 ret = 0;
2038                 update_size = false;
2039                 goto out;
2040         }
2041
2042         found_key.objectid = log_key.objectid;
2043         found_key.type = BTRFS_INODE_EXTREF_KEY;
2044         found_key.offset = key->objectid;
2045         ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2046                              name_len);
2047         if (ret < 0) {
2048                 goto out;
2049         } else if (ret) {
2050                 /* The dentry will be added later. */
2051                 ret = 0;
2052                 update_size = false;
2053                 goto out;
2054         }
2055         btrfs_release_path(path);
2056         ret = insert_one_name(trans, root, key->objectid, key->offset,
2057                               name, name_len, &log_key);
2058         if (ret && ret != -ENOENT && ret != -EEXIST)
2059                 goto out;
2060         if (!ret)
2061                 name_added = true;
2062         update_size = false;
2063         ret = 0;
2064         goto out;
2065 }
2066
2067 /*
2068  * find all the names in a directory item and reconcile them into
2069  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2070  * one name in a directory item, but the same code gets used for
2071  * both directory index types
2072  */
2073 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2074                                         struct btrfs_root *root,
2075                                         struct btrfs_path *path,
2076                                         struct extent_buffer *eb, int slot,
2077                                         struct btrfs_key *key)
2078 {
2079         int ret = 0;
2080         u32 item_size = btrfs_item_size_nr(eb, slot);
2081         struct btrfs_dir_item *di;
2082         int name_len;
2083         unsigned long ptr;
2084         unsigned long ptr_end;
2085         struct btrfs_path *fixup_path = NULL;
2086
2087         ptr = btrfs_item_ptr_offset(eb, slot);
2088         ptr_end = ptr + item_size;
2089         while (ptr < ptr_end) {
2090                 di = (struct btrfs_dir_item *)ptr;
2091                 name_len = btrfs_dir_name_len(eb, di);
2092                 ret = replay_one_name(trans, root, path, eb, di, key);
2093                 if (ret < 0)
2094                         break;
2095                 ptr = (unsigned long)(di + 1);
2096                 ptr += name_len;
2097
2098                 /*
2099                  * If this entry refers to a non-directory (directories can not
2100                  * have a link count > 1) and it was added in the transaction
2101                  * that was not committed, make sure we fixup the link count of
2102                  * the inode it the entry points to. Otherwise something like
2103                  * the following would result in a directory pointing to an
2104                  * inode with a wrong link that does not account for this dir
2105                  * entry:
2106                  *
2107                  * mkdir testdir
2108                  * touch testdir/foo
2109                  * touch testdir/bar
2110                  * sync
2111                  *
2112                  * ln testdir/bar testdir/bar_link
2113                  * ln testdir/foo testdir/foo_link
2114                  * xfs_io -c "fsync" testdir/bar
2115                  *
2116                  * <power failure>
2117                  *
2118                  * mount fs, log replay happens
2119                  *
2120                  * File foo would remain with a link count of 1 when it has two
2121                  * entries pointing to it in the directory testdir. This would
2122                  * make it impossible to ever delete the parent directory has
2123                  * it would result in stale dentries that can never be deleted.
2124                  */
2125                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2126                         struct btrfs_key di_key;
2127
2128                         if (!fixup_path) {
2129                                 fixup_path = btrfs_alloc_path();
2130                                 if (!fixup_path) {
2131                                         ret = -ENOMEM;
2132                                         break;
2133                                 }
2134                         }
2135
2136                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2137                         ret = link_to_fixup_dir(trans, root, fixup_path,
2138                                                 di_key.objectid);
2139                         if (ret)
2140                                 break;
2141                 }
2142                 ret = 0;
2143         }
2144         btrfs_free_path(fixup_path);
2145         return ret;
2146 }
2147
2148 /*
2149  * directory replay has two parts.  There are the standard directory
2150  * items in the log copied from the subvolume, and range items
2151  * created in the log while the subvolume was logged.
2152  *
2153  * The range items tell us which parts of the key space the log
2154  * is authoritative for.  During replay, if a key in the subvolume
2155  * directory is in a logged range item, but not actually in the log
2156  * that means it was deleted from the directory before the fsync
2157  * and should be removed.
2158  */
2159 static noinline int find_dir_range(struct btrfs_root *root,
2160                                    struct btrfs_path *path,
2161                                    u64 dirid, int key_type,
2162                                    u64 *start_ret, u64 *end_ret)
2163 {
2164         struct btrfs_key key;
2165         u64 found_end;
2166         struct btrfs_dir_log_item *item;
2167         int ret;
2168         int nritems;
2169
2170         if (*start_ret == (u64)-1)
2171                 return 1;
2172
2173         key.objectid = dirid;
2174         key.type = key_type;
2175         key.offset = *start_ret;
2176
2177         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2178         if (ret < 0)
2179                 goto out;
2180         if (ret > 0) {
2181                 if (path->slots[0] == 0)
2182                         goto out;
2183                 path->slots[0]--;
2184         }
2185         if (ret != 0)
2186                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2187
2188         if (key.type != key_type || key.objectid != dirid) {
2189                 ret = 1;
2190                 goto next;
2191         }
2192         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2193                               struct btrfs_dir_log_item);
2194         found_end = btrfs_dir_log_end(path->nodes[0], item);
2195
2196         if (*start_ret >= key.offset && *start_ret <= found_end) {
2197                 ret = 0;
2198                 *start_ret = key.offset;
2199                 *end_ret = found_end;
2200                 goto out;
2201         }
2202         ret = 1;
2203 next:
2204         /* check the next slot in the tree to see if it is a valid item */
2205         nritems = btrfs_header_nritems(path->nodes[0]);
2206         path->slots[0]++;
2207         if (path->slots[0] >= nritems) {
2208                 ret = btrfs_next_leaf(root, path);
2209                 if (ret)
2210                         goto out;
2211         }
2212
2213         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2214
2215         if (key.type != key_type || key.objectid != dirid) {
2216                 ret = 1;
2217                 goto out;
2218         }
2219         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2220                               struct btrfs_dir_log_item);
2221         found_end = btrfs_dir_log_end(path->nodes[0], item);
2222         *start_ret = key.offset;
2223         *end_ret = found_end;
2224         ret = 0;
2225 out:
2226         btrfs_release_path(path);
2227         return ret;
2228 }
2229
2230 /*
2231  * this looks for a given directory item in the log.  If the directory
2232  * item is not in the log, the item is removed and the inode it points
2233  * to is unlinked
2234  */
2235 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2236                                       struct btrfs_root *root,
2237                                       struct btrfs_root *log,
2238                                       struct btrfs_path *path,
2239                                       struct btrfs_path *log_path,
2240                                       struct inode *dir,
2241                                       struct btrfs_key *dir_key)
2242 {
2243         int ret;
2244         struct extent_buffer *eb;
2245         int slot;
2246         u32 item_size;
2247         struct btrfs_dir_item *di;
2248         struct btrfs_dir_item *log_di;
2249         int name_len;
2250         unsigned long ptr;
2251         unsigned long ptr_end;
2252         char *name;
2253         struct inode *inode;
2254         struct btrfs_key location;
2255
2256 again:
2257         eb = path->nodes[0];
2258         slot = path->slots[0];
2259         item_size = btrfs_item_size_nr(eb, slot);
2260         ptr = btrfs_item_ptr_offset(eb, slot);
2261         ptr_end = ptr + item_size;
2262         while (ptr < ptr_end) {
2263                 di = (struct btrfs_dir_item *)ptr;
2264                 name_len = btrfs_dir_name_len(eb, di);
2265                 name = kmalloc(name_len, GFP_NOFS);
2266                 if (!name) {
2267                         ret = -ENOMEM;
2268                         goto out;
2269                 }
2270                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2271                                   name_len);
2272                 log_di = NULL;
2273                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2274                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2275                                                        dir_key->objectid,
2276                                                        name, name_len, 0);
2277                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2278                         log_di = btrfs_lookup_dir_index_item(trans, log,
2279                                                      log_path,
2280                                                      dir_key->objectid,
2281                                                      dir_key->offset,
2282                                                      name, name_len, 0);
2283                 }
2284                 if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2285                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2286                         btrfs_release_path(path);
2287                         btrfs_release_path(log_path);
2288                         inode = read_one_inode(root, location.objectid);
2289                         if (!inode) {
2290                                 kfree(name);
2291                                 return -EIO;
2292                         }
2293
2294                         ret = link_to_fixup_dir(trans, root,
2295                                                 path, location.objectid);
2296                         if (ret) {
2297                                 kfree(name);
2298                                 iput(inode);
2299                                 goto out;
2300                         }
2301
2302                         inc_nlink(inode);
2303                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2304                                         BTRFS_I(inode), name, name_len);
2305                         if (!ret)
2306                                 ret = btrfs_run_delayed_items(trans);
2307                         kfree(name);
2308                         iput(inode);
2309                         if (ret)
2310                                 goto out;
2311
2312                         /* there might still be more names under this key
2313                          * check and repeat if required
2314                          */
2315                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2316                                                 0, 0);
2317                         if (ret == 0)
2318                                 goto again;
2319                         ret = 0;
2320                         goto out;
2321                 } else if (IS_ERR(log_di)) {
2322                         kfree(name);
2323                         return PTR_ERR(log_di);
2324                 }
2325                 btrfs_release_path(log_path);
2326                 kfree(name);
2327
2328                 ptr = (unsigned long)(di + 1);
2329                 ptr += name_len;
2330         }
2331         ret = 0;
2332 out:
2333         btrfs_release_path(path);
2334         btrfs_release_path(log_path);
2335         return ret;
2336 }
2337
2338 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2339                               struct btrfs_root *root,
2340                               struct btrfs_root *log,
2341                               struct btrfs_path *path,
2342                               const u64 ino)
2343 {
2344         struct btrfs_key search_key;
2345         struct btrfs_path *log_path;
2346         int i;
2347         int nritems;
2348         int ret;
2349
2350         log_path = btrfs_alloc_path();
2351         if (!log_path)
2352                 return -ENOMEM;
2353
2354         search_key.objectid = ino;
2355         search_key.type = BTRFS_XATTR_ITEM_KEY;
2356         search_key.offset = 0;
2357 again:
2358         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2359         if (ret < 0)
2360                 goto out;
2361 process_leaf:
2362         nritems = btrfs_header_nritems(path->nodes[0]);
2363         for (i = path->slots[0]; i < nritems; i++) {
2364                 struct btrfs_key key;
2365                 struct btrfs_dir_item *di;
2366                 struct btrfs_dir_item *log_di;
2367                 u32 total_size;
2368                 u32 cur;
2369
2370                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2371                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2372                         ret = 0;
2373                         goto out;
2374                 }
2375
2376                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2377                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2378                 cur = 0;
2379                 while (cur < total_size) {
2380                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2381                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2382                         u32 this_len = sizeof(*di) + name_len + data_len;
2383                         char *name;
2384
2385                         name = kmalloc(name_len, GFP_NOFS);
2386                         if (!name) {
2387                                 ret = -ENOMEM;
2388                                 goto out;
2389                         }
2390                         read_extent_buffer(path->nodes[0], name,
2391                                            (unsigned long)(di + 1), name_len);
2392
2393                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2394                                                     name, name_len, 0);
2395                         btrfs_release_path(log_path);
2396                         if (!log_di) {
2397                                 /* Doesn't exist in log tree, so delete it. */
2398                                 btrfs_release_path(path);
2399                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2400                                                         name, name_len, -1);
2401                                 kfree(name);
2402                                 if (IS_ERR(di)) {
2403                                         ret = PTR_ERR(di);
2404                                         goto out;
2405                                 }
2406                                 ASSERT(di);
2407                                 ret = btrfs_delete_one_dir_name(trans, root,
2408                                                                 path, di);
2409                                 if (ret)
2410                                         goto out;
2411                                 btrfs_release_path(path);
2412                                 search_key = key;
2413                                 goto again;
2414                         }
2415                         kfree(name);
2416                         if (IS_ERR(log_di)) {
2417                                 ret = PTR_ERR(log_di);
2418                                 goto out;
2419                         }
2420                         cur += this_len;
2421                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2422                 }
2423         }
2424         ret = btrfs_next_leaf(root, path);
2425         if (ret > 0)
2426                 ret = 0;
2427         else if (ret == 0)
2428                 goto process_leaf;
2429 out:
2430         btrfs_free_path(log_path);
2431         btrfs_release_path(path);
2432         return ret;
2433 }
2434
2435
2436 /*
2437  * deletion replay happens before we copy any new directory items
2438  * out of the log or out of backreferences from inodes.  It
2439  * scans the log to find ranges of keys that log is authoritative for,
2440  * and then scans the directory to find items in those ranges that are
2441  * not present in the log.
2442  *
2443  * Anything we don't find in the log is unlinked and removed from the
2444  * directory.
2445  */
2446 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2447                                        struct btrfs_root *root,
2448                                        struct btrfs_root *log,
2449                                        struct btrfs_path *path,
2450                                        u64 dirid, int del_all)
2451 {
2452         u64 range_start;
2453         u64 range_end;
2454         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2455         int ret = 0;
2456         struct btrfs_key dir_key;
2457         struct btrfs_key found_key;
2458         struct btrfs_path *log_path;
2459         struct inode *dir;
2460
2461         dir_key.objectid = dirid;
2462         dir_key.type = BTRFS_DIR_ITEM_KEY;
2463         log_path = btrfs_alloc_path();
2464         if (!log_path)
2465                 return -ENOMEM;
2466
2467         dir = read_one_inode(root, dirid);
2468         /* it isn't an error if the inode isn't there, that can happen
2469          * because we replay the deletes before we copy in the inode item
2470          * from the log
2471          */
2472         if (!dir) {
2473                 btrfs_free_path(log_path);
2474                 return 0;
2475         }
2476 again:
2477         range_start = 0;
2478         range_end = 0;
2479         while (1) {
2480                 if (del_all)
2481                         range_end = (u64)-1;
2482                 else {
2483                         ret = find_dir_range(log, path, dirid, key_type,
2484                                              &range_start, &range_end);
2485                         if (ret != 0)
2486                                 break;
2487                 }
2488
2489                 dir_key.offset = range_start;
2490                 while (1) {
2491                         int nritems;
2492                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2493                                                 0, 0);
2494                         if (ret < 0)
2495                                 goto out;
2496
2497                         nritems = btrfs_header_nritems(path->nodes[0]);
2498                         if (path->slots[0] >= nritems) {
2499                                 ret = btrfs_next_leaf(root, path);
2500                                 if (ret == 1)
2501                                         break;
2502                                 else if (ret < 0)
2503                                         goto out;
2504                         }
2505                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2506                                               path->slots[0]);
2507                         if (found_key.objectid != dirid ||
2508                             found_key.type != dir_key.type)
2509                                 goto next_type;
2510
2511                         if (found_key.offset > range_end)
2512                                 break;
2513
2514                         ret = check_item_in_log(trans, root, log, path,
2515                                                 log_path, dir,
2516                                                 &found_key);
2517                         if (ret)
2518                                 goto out;
2519                         if (found_key.offset == (u64)-1)
2520                                 break;
2521                         dir_key.offset = found_key.offset + 1;
2522                 }
2523                 btrfs_release_path(path);
2524                 if (range_end == (u64)-1)
2525                         break;
2526                 range_start = range_end + 1;
2527         }
2528
2529 next_type:
2530         ret = 0;
2531         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2532                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2533                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2534                 btrfs_release_path(path);
2535                 goto again;
2536         }
2537 out:
2538         btrfs_release_path(path);
2539         btrfs_free_path(log_path);
2540         iput(dir);
2541         return ret;
2542 }
2543
2544 /*
2545  * the process_func used to replay items from the log tree.  This
2546  * gets called in two different stages.  The first stage just looks
2547  * for inodes and makes sure they are all copied into the subvolume.
2548  *
2549  * The second stage copies all the other item types from the log into
2550  * the subvolume.  The two stage approach is slower, but gets rid of
2551  * lots of complexity around inodes referencing other inodes that exist
2552  * only in the log (references come from either directory items or inode
2553  * back refs).
2554  */
2555 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2556                              struct walk_control *wc, u64 gen, int level)
2557 {
2558         int nritems;
2559         struct btrfs_path *path;
2560         struct btrfs_root *root = wc->replay_dest;
2561         struct btrfs_key key;
2562         int i;
2563         int ret;
2564
2565         ret = btrfs_read_buffer(eb, gen, level, NULL);
2566         if (ret)
2567                 return ret;
2568
2569         level = btrfs_header_level(eb);
2570
2571         if (level != 0)
2572                 return 0;
2573
2574         path = btrfs_alloc_path();
2575         if (!path)
2576                 return -ENOMEM;
2577
2578         nritems = btrfs_header_nritems(eb);
2579         for (i = 0; i < nritems; i++) {
2580                 btrfs_item_key_to_cpu(eb, &key, i);
2581
2582                 /* inode keys are done during the first stage */
2583                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2584                     wc->stage == LOG_WALK_REPLAY_INODES) {
2585                         struct btrfs_inode_item *inode_item;
2586                         u32 mode;
2587
2588                         inode_item = btrfs_item_ptr(eb, i,
2589                                             struct btrfs_inode_item);
2590                         /*
2591                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2592                          * and never got linked before the fsync, skip it, as
2593                          * replaying it is pointless since it would be deleted
2594                          * later. We skip logging tmpfiles, but it's always
2595                          * possible we are replaying a log created with a kernel
2596                          * that used to log tmpfiles.
2597                          */
2598                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
2599                                 wc->ignore_cur_inode = true;
2600                                 continue;
2601                         } else {
2602                                 wc->ignore_cur_inode = false;
2603                         }
2604                         ret = replay_xattr_deletes(wc->trans, root, log,
2605                                                    path, key.objectid);
2606                         if (ret)
2607                                 break;
2608                         mode = btrfs_inode_mode(eb, inode_item);
2609                         if (S_ISDIR(mode)) {
2610                                 ret = replay_dir_deletes(wc->trans,
2611                                          root, log, path, key.objectid, 0);
2612                                 if (ret)
2613                                         break;
2614                         }
2615                         ret = overwrite_item(wc->trans, root, path,
2616                                              eb, i, &key);
2617                         if (ret)
2618                                 break;
2619
2620                         /*
2621                          * Before replaying extents, truncate the inode to its
2622                          * size. We need to do it now and not after log replay
2623                          * because before an fsync we can have prealloc extents
2624                          * added beyond the inode's i_size. If we did it after,
2625                          * through orphan cleanup for example, we would drop
2626                          * those prealloc extents just after replaying them.
2627                          */
2628                         if (S_ISREG(mode)) {
2629                                 struct btrfs_drop_extents_args drop_args = { 0 };
2630                                 struct inode *inode;
2631                                 u64 from;
2632
2633                                 inode = read_one_inode(root, key.objectid);
2634                                 if (!inode) {
2635                                         ret = -EIO;
2636                                         break;
2637                                 }
2638                                 from = ALIGN(i_size_read(inode),
2639                                              root->fs_info->sectorsize);
2640                                 drop_args.start = from;
2641                                 drop_args.end = (u64)-1;
2642                                 drop_args.drop_cache = true;
2643                                 ret = btrfs_drop_extents(wc->trans, root,
2644                                                          BTRFS_I(inode),
2645                                                          &drop_args);
2646                                 if (!ret) {
2647                                         inode_sub_bytes(inode,
2648                                                         drop_args.bytes_found);
2649                                         /* Update the inode's nbytes. */
2650                                         ret = btrfs_update_inode(wc->trans,
2651                                                         root, BTRFS_I(inode));
2652                                 }
2653                                 iput(inode);
2654                                 if (ret)
2655                                         break;
2656                         }
2657
2658                         ret = link_to_fixup_dir(wc->trans, root,
2659                                                 path, key.objectid);
2660                         if (ret)
2661                                 break;
2662                 }
2663
2664                 if (wc->ignore_cur_inode)
2665                         continue;
2666
2667                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2668                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2669                         ret = replay_one_dir_item(wc->trans, root, path,
2670                                                   eb, i, &key);
2671                         if (ret)
2672                                 break;
2673                 }
2674
2675                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2676                         continue;
2677
2678                 /* these keys are simply copied */
2679                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2680                         ret = overwrite_item(wc->trans, root, path,
2681                                              eb, i, &key);
2682                         if (ret)
2683                                 break;
2684                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2685                            key.type == BTRFS_INODE_EXTREF_KEY) {
2686                         ret = add_inode_ref(wc->trans, root, log, path,
2687                                             eb, i, &key);
2688                         if (ret && ret != -ENOENT)
2689                                 break;
2690                         ret = 0;
2691                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2692                         ret = replay_one_extent(wc->trans, root, path,
2693                                                 eb, i, &key);
2694                         if (ret)
2695                                 break;
2696                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2697                         ret = replay_one_dir_item(wc->trans, root, path,
2698                                                   eb, i, &key);
2699                         if (ret)
2700                                 break;
2701                 }
2702         }
2703         btrfs_free_path(path);
2704         return ret;
2705 }
2706
2707 /*
2708  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2709  */
2710 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2711 {
2712         struct btrfs_block_group *cache;
2713
2714         cache = btrfs_lookup_block_group(fs_info, start);
2715         if (!cache) {
2716                 btrfs_err(fs_info, "unable to find block group for %llu", start);
2717                 return;
2718         }
2719
2720         spin_lock(&cache->space_info->lock);
2721         spin_lock(&cache->lock);
2722         cache->reserved -= fs_info->nodesize;
2723         cache->space_info->bytes_reserved -= fs_info->nodesize;
2724         spin_unlock(&cache->lock);
2725         spin_unlock(&cache->space_info->lock);
2726
2727         btrfs_put_block_group(cache);
2728 }
2729
2730 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2731                                    struct btrfs_root *root,
2732                                    struct btrfs_path *path, int *level,
2733                                    struct walk_control *wc)
2734 {
2735         struct btrfs_fs_info *fs_info = root->fs_info;
2736         u64 bytenr;
2737         u64 ptr_gen;
2738         struct extent_buffer *next;
2739         struct extent_buffer *cur;
2740         u32 blocksize;
2741         int ret = 0;
2742
2743         while (*level > 0) {
2744                 struct btrfs_key first_key;
2745
2746                 cur = path->nodes[*level];
2747
2748                 WARN_ON(btrfs_header_level(cur) != *level);
2749
2750                 if (path->slots[*level] >=
2751                     btrfs_header_nritems(cur))
2752                         break;
2753
2754                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2755                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2756                 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2757                 blocksize = fs_info->nodesize;
2758
2759                 next = btrfs_find_create_tree_block(fs_info, bytenr,
2760                                                     btrfs_header_owner(cur),
2761                                                     *level - 1);
2762                 if (IS_ERR(next))
2763                         return PTR_ERR(next);
2764
2765                 if (*level == 1) {
2766                         ret = wc->process_func(root, next, wc, ptr_gen,
2767                                                *level - 1);
2768                         if (ret) {
2769                                 free_extent_buffer(next);
2770                                 return ret;
2771                         }
2772
2773                         path->slots[*level]++;
2774                         if (wc->free) {
2775                                 ret = btrfs_read_buffer(next, ptr_gen,
2776                                                         *level - 1, &first_key);
2777                                 if (ret) {
2778                                         free_extent_buffer(next);
2779                                         return ret;
2780                                 }
2781
2782                                 if (trans) {
2783                                         btrfs_tree_lock(next);
2784                                         btrfs_clean_tree_block(next);
2785                                         btrfs_wait_tree_block_writeback(next);
2786                                         btrfs_tree_unlock(next);
2787                                         ret = btrfs_pin_reserved_extent(trans,
2788                                                         bytenr, blocksize);
2789                                         if (ret) {
2790                                                 free_extent_buffer(next);
2791                                                 return ret;
2792                                         }
2793                                         btrfs_redirty_list_add(
2794                                                 trans->transaction, next);
2795                                 } else {
2796                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2797                                                 clear_extent_buffer_dirty(next);
2798                                         unaccount_log_buffer(fs_info, bytenr);
2799                                 }
2800                         }
2801                         free_extent_buffer(next);
2802                         continue;
2803                 }
2804                 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2805                 if (ret) {
2806                         free_extent_buffer(next);
2807                         return ret;
2808                 }
2809
2810                 if (path->nodes[*level-1])
2811                         free_extent_buffer(path->nodes[*level-1]);
2812                 path->nodes[*level-1] = next;
2813                 *level = btrfs_header_level(next);
2814                 path->slots[*level] = 0;
2815                 cond_resched();
2816         }
2817         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2818
2819         cond_resched();
2820         return 0;
2821 }
2822
2823 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2824                                  struct btrfs_root *root,
2825                                  struct btrfs_path *path, int *level,
2826                                  struct walk_control *wc)
2827 {
2828         struct btrfs_fs_info *fs_info = root->fs_info;
2829         int i;
2830         int slot;
2831         int ret;
2832
2833         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2834                 slot = path->slots[i];
2835                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2836                         path->slots[i]++;
2837                         *level = i;
2838                         WARN_ON(*level == 0);
2839                         return 0;
2840                 } else {
2841                         ret = wc->process_func(root, path->nodes[*level], wc,
2842                                  btrfs_header_generation(path->nodes[*level]),
2843                                  *level);
2844                         if (ret)
2845                                 return ret;
2846
2847                         if (wc->free) {
2848                                 struct extent_buffer *next;
2849
2850                                 next = path->nodes[*level];
2851
2852                                 if (trans) {
2853                                         btrfs_tree_lock(next);
2854                                         btrfs_clean_tree_block(next);
2855                                         btrfs_wait_tree_block_writeback(next);
2856                                         btrfs_tree_unlock(next);
2857                                         ret = btrfs_pin_reserved_extent(trans,
2858                                                      path->nodes[*level]->start,
2859                                                      path->nodes[*level]->len);
2860                                         if (ret)
2861                                                 return ret;
2862                                 } else {
2863                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2864                                                 clear_extent_buffer_dirty(next);
2865
2866                                         unaccount_log_buffer(fs_info,
2867                                                 path->nodes[*level]->start);
2868                                 }
2869                         }
2870                         free_extent_buffer(path->nodes[*level]);
2871                         path->nodes[*level] = NULL;
2872                         *level = i + 1;
2873                 }
2874         }
2875         return 1;
2876 }
2877
/*
 * Walk the blocks of the log tree 'log', calling wc->process_func on each
 * one.  When wc->free is set, each visited block is also released: it is
 * pinned if a transaction handle was passed, otherwise it is cleaned and
 * its reserved bytes are unaccounted.
 */
2883 static int walk_log_tree(struct btrfs_trans_handle *trans,
2884                          struct btrfs_root *log, struct walk_control *wc)
2885 {
2886         struct btrfs_fs_info *fs_info = log->fs_info;
2887         int ret = 0;
2888         int wret;
2889         int level;
2890         struct btrfs_path *path;
2891         int orig_level;
2892
2893         path = btrfs_alloc_path();
2894         if (!path)
2895                 return -ENOMEM;
2896
2897         level = btrfs_header_level(log->node);
2898         orig_level = level;
2899         path->nodes[level] = log->node;
2900         atomic_inc(&log->node->refs);
2901         path->slots[level] = 0;
2902
2903         while (1) {
2904                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2905                 if (wret > 0)
2906                         break;
2907                 if (wret < 0) {
2908                         ret = wret;
2909                         goto out;
2910                 }
2911
2912                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2913                 if (wret > 0)
2914                         break;
2915                 if (wret < 0) {
2916                         ret = wret;
2917                         goto out;
2918                 }
2919         }
2920
2921         /* was the root node processed? if not, catch it here */
2922         if (path->nodes[orig_level]) {
2923                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2924                          btrfs_header_generation(path->nodes[orig_level]),
2925                          orig_level);
2926                 if (ret)
2927                         goto out;
2928                 if (wc->free) {
2929                         struct extent_buffer *next;
2930
2931                         next = path->nodes[orig_level];
2932
2933                         if (trans) {
2934                                 btrfs_tree_lock(next);
2935                                 btrfs_clean_tree_block(next);
2936                                 btrfs_wait_tree_block_writeback(next);
2937                                 btrfs_tree_unlock(next);
2938                                 ret = btrfs_pin_reserved_extent(trans,
2939                                                 next->start, next->len);
2940                                 if (ret)
2941                                         goto out;
2942                         } else {
2943                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2944                                         clear_extent_buffer_dirty(next);
2945                                 unaccount_log_buffer(fs_info, next->start);
2946                         }
2947                 }
2948         }
2949
2950 out:
2951         btrfs_free_path(path);
2952         return ret;
2953 }
2954
2955 /*
2956  * helper function to update the item for a given subvolumes log root
2957  * in the tree of log roots
2958  */
2959 static int update_log_root(struct btrfs_trans_handle *trans,
2960                            struct btrfs_root *log,
2961                            struct btrfs_root_item *root_item)
2962 {
2963         struct btrfs_fs_info *fs_info = log->fs_info;
2964         int ret;
2965
2966         if (log->log_transid == 1) {
2967                 /* insert root item on the first sync */
2968                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2969                                 &log->root_key, root_item);
2970         } else {
2971                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2972                                 &log->root_key, root_item);
2973         }
2974         return ret;
2975 }
2976
2977 static void wait_log_commit(struct btrfs_root *root, int transid)
2978 {
2979         DEFINE_WAIT(wait);
2980         int index = transid % 2;
2981
2982         /*
2983          * we only allow two pending log transactions at a time,
2984          * so we know that if ours is more than 2 older than the
2985          * current transaction, we're done
2986          */
2987         for (;;) {
2988                 prepare_to_wait(&root->log_commit_wait[index],
2989                                 &wait, TASK_UNINTERRUPTIBLE);
2990
2991                 if (!(root->log_transid_committed < transid &&
2992                       atomic_read(&root->log_commit[index])))
2993                         break;
2994
2995                 mutex_unlock(&root->log_mutex);
2996                 schedule();
2997                 mutex_lock(&root->log_mutex);
2998         }
2999         finish_wait(&root->log_commit_wait[index], &wait);
3000 }
3001
3002 static void wait_for_writer(struct btrfs_root *root)
3003 {
3004         DEFINE_WAIT(wait);
3005
3006         for (;;) {
3007                 prepare_to_wait(&root->log_writer_wait, &wait,
3008                                 TASK_UNINTERRUPTIBLE);
3009                 if (!atomic_read(&root->log_writers))
3010                         break;
3011
3012                 mutex_unlock(&root->log_mutex);
3013                 schedule();
3014                 mutex_lock(&root->log_mutex);
3015         }
3016         finish_wait(&root->log_writer_wait, &wait);
3017 }
3018
3019 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3020                                         struct btrfs_log_ctx *ctx)
3021 {
3022         if (!ctx)
3023                 return;
3024
3025         mutex_lock(&root->log_mutex);
3026         list_del_init(&ctx->list);
3027         mutex_unlock(&root->log_mutex);
3028 }
3029
3030 /*
3031  * Invoked in log mutex context, or be sure there is no other task which
3032  * can access the list.
3033  */
3034 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3035                                              int index, int error)
3036 {
3037         struct btrfs_log_ctx *ctx;
3038         struct btrfs_log_ctx *safe;
3039
3040         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3041                 list_del_init(&ctx->list);
3042                 ctx->log_ret = error;
3043         }
3044 }
3045
3046 /*
3047  * btrfs_sync_log does sends a given tree log down to the disk and
3048  * updates the super blocks to record it.  When this call is done,
3049  * you know that any inodes previously logged are safely on disk only
3050  * if it returns 0.
3051  *
3052  * Any other return value means you need to call btrfs_commit_transaction.
3053  * Some of the edge cases for fsyncing directories that have had unlinks
3054  * or renames done in the past mean that sometimes the only safe
3055  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3056  * that has happened.
3057  */
3058 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3059                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3060 {
3061         int index1;
3062         int index2;
3063         int mark;
3064         int ret;
3065         struct btrfs_fs_info *fs_info = root->fs_info;
3066         struct btrfs_root *log = root->log_root;
3067         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3068         struct btrfs_root_item new_root_item;
3069         int log_transid = 0;
3070         struct btrfs_log_ctx root_log_ctx;
3071         struct blk_plug plug;
3072         u64 log_root_start;
3073         u64 log_root_level;
3074
3075         mutex_lock(&root->log_mutex);
3076         log_transid = ctx->log_transid;
3077         if (root->log_transid_committed >= log_transid) {
3078                 mutex_unlock(&root->log_mutex);
3079                 return ctx->log_ret;
3080         }
3081
3082         index1 = log_transid % 2;
3083         if (atomic_read(&root->log_commit[index1])) {
3084                 wait_log_commit(root, log_transid);
3085                 mutex_unlock(&root->log_mutex);
3086                 return ctx->log_ret;
3087         }
3088         ASSERT(log_transid == root->log_transid);
3089         atomic_set(&root->log_commit[index1], 1);
3090
3091         /* wait for previous tree log sync to complete */
3092         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3093                 wait_log_commit(root, log_transid - 1);
3094
3095         while (1) {
3096                 int batch = atomic_read(&root->log_batch);
3097                 /* when we're on an ssd, just kick the log commit out */
3098                 if (!btrfs_test_opt(fs_info, SSD) &&
3099                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3100                         mutex_unlock(&root->log_mutex);
3101                         schedule_timeout_uninterruptible(1);
3102                         mutex_lock(&root->log_mutex);
3103                 }
3104                 wait_for_writer(root);
3105                 if (batch == atomic_read(&root->log_batch))
3106                         break;
3107         }
3108
3109         /* bail out if we need to do a full commit */
3110         if (btrfs_need_log_full_commit(trans)) {
3111                 ret = -EAGAIN;
3112                 mutex_unlock(&root->log_mutex);
3113                 goto out;
3114         }
3115
3116         if (log_transid % 2 == 0)
3117                 mark = EXTENT_DIRTY;
3118         else
3119                 mark = EXTENT_NEW;
3120
3121         /* we start IO on  all the marked extents here, but we don't actually
3122          * wait for them until later.
3123          */
3124         blk_start_plug(&plug);
3125         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3126         /*
3127          * -EAGAIN happens when someone, e.g., a concurrent transaction
3128          *  commit, writes a dirty extent in this tree-log commit. This
3129          *  concurrent write will create a hole writing out the extents,
3130          *  and we cannot proceed on a zoned filesystem, requiring
3131          *  sequential writing. While we can bail out to a full commit
3132          *  here, but we can continue hoping the concurrent writing fills
3133          *  the hole.
3134          */
3135         if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3136                 ret = 0;
3137         if (ret) {
3138                 blk_finish_plug(&plug);
3139                 btrfs_abort_transaction(trans, ret);
3140                 btrfs_set_log_full_commit(trans);
3141                 mutex_unlock(&root->log_mutex);
3142                 goto out;
3143         }
3144
3145         /*
3146          * We _must_ update under the root->log_mutex in order to make sure we
3147          * have a consistent view of the log root we are trying to commit at
3148          * this moment.
3149          *
3150          * We _must_ copy this into a local copy, because we are not holding the
3151          * log_root_tree->log_mutex yet.  This is important because when we
3152          * commit the log_root_tree we must have a consistent view of the
3153          * log_root_tree when we update the super block to point at the
3154          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3155          * with the commit and possibly point at the new block which we may not
3156          * have written out.
3157          */
3158         btrfs_set_root_node(&log->root_item, log->node);
3159         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3160
3161         root->log_transid++;
3162         log->log_transid = root->log_transid;
3163         root->log_start_pid = 0;
3164         /*
3165          * IO has been started, blocks of the log tree have WRITTEN flag set
3166          * in their headers. new modifications of the log will be written to
3167          * new positions. so it's safe to allow log writers to go in.
3168          */
3169         mutex_unlock(&root->log_mutex);
3170
3171         if (btrfs_is_zoned(fs_info)) {
3172                 mutex_lock(&fs_info->tree_root->log_mutex);
3173                 if (!log_root_tree->node) {
3174                         ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3175                         if (ret) {
3176                                 mutex_unlock(&fs_info->tree_root->log_mutex);
3177                                 goto out;
3178                         }
3179                 }
3180                 mutex_unlock(&fs_info->tree_root->log_mutex);
3181         }
3182
3183         btrfs_init_log_ctx(&root_log_ctx, NULL);
3184
3185         mutex_lock(&log_root_tree->log_mutex);
3186
3187         index2 = log_root_tree->log_transid % 2;
3188         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3189         root_log_ctx.log_transid = log_root_tree->log_transid;
3190
3191         /*
3192          * Now we are safe to update the log_root_tree because we're under the
3193          * log_mutex, and we're a current writer so we're holding the commit
3194          * open until we drop the log_mutex.
3195          */
3196         ret = update_log_root(trans, log, &new_root_item);
3197         if (ret) {
3198                 if (!list_empty(&root_log_ctx.list))
3199                         list_del_init(&root_log_ctx.list);
3200
3201                 blk_finish_plug(&plug);
3202                 btrfs_set_log_full_commit(trans);
3203
3204                 if (ret != -ENOSPC) {
3205                         btrfs_abort_transaction(trans, ret);
3206                         mutex_unlock(&log_root_tree->log_mutex);
3207                         goto out;
3208                 }
3209                 btrfs_wait_tree_log_extents(log, mark);
3210                 mutex_unlock(&log_root_tree->log_mutex);
3211                 ret = -EAGAIN;
3212                 goto out;
3213         }
3214
3215         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3216                 blk_finish_plug(&plug);
3217                 list_del_init(&root_log_ctx.list);
3218                 mutex_unlock(&log_root_tree->log_mutex);
3219                 ret = root_log_ctx.log_ret;
3220                 goto out;
3221         }
3222
3223         index2 = root_log_ctx.log_transid % 2;
3224         if (atomic_read(&log_root_tree->log_commit[index2])) {
3225                 blk_finish_plug(&plug);
3226                 ret = btrfs_wait_tree_log_extents(log, mark);
3227                 wait_log_commit(log_root_tree,
3228                                 root_log_ctx.log_transid);
3229                 mutex_unlock(&log_root_tree->log_mutex);
3230                 if (!ret)
3231                         ret = root_log_ctx.log_ret;
3232                 goto out;
3233         }
3234         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3235         atomic_set(&log_root_tree->log_commit[index2], 1);
3236
3237         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3238                 wait_log_commit(log_root_tree,
3239                                 root_log_ctx.log_transid - 1);
3240         }
3241
3242         /*
3243          * now that we've moved on to the tree of log tree roots,
3244          * check the full commit flag again
3245          */
3246         if (btrfs_need_log_full_commit(trans)) {
3247                 blk_finish_plug(&plug);
3248                 btrfs_wait_tree_log_extents(log, mark);
3249                 mutex_unlock(&log_root_tree->log_mutex);
3250                 ret = -EAGAIN;
3251                 goto out_wake_log_root;
3252         }
3253
3254         ret = btrfs_write_marked_extents(fs_info,
3255                                          &log_root_tree->dirty_log_pages,
3256                                          EXTENT_DIRTY | EXTENT_NEW);
3257         blk_finish_plug(&plug);
3258         /*
3259          * As described above, -EAGAIN indicates a hole in the extents. We
3260          * cannot wait for these write outs since the waiting cause a
3261          * deadlock. Bail out to the full commit instead.
3262          */
3263         if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3264                 btrfs_set_log_full_commit(trans);
3265                 btrfs_wait_tree_log_extents(log, mark);
3266                 mutex_unlock(&log_root_tree->log_mutex);
3267                 goto out_wake_log_root;
3268         } else if (ret) {
3269                 btrfs_set_log_full_commit(trans);
3270                 btrfs_abort_transaction(trans, ret);
3271                 mutex_unlock(&log_root_tree->log_mutex);
3272                 goto out_wake_log_root;
3273         }
3274         ret = btrfs_wait_tree_log_extents(log, mark);
3275         if (!ret)
3276                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3277                                                   EXTENT_NEW | EXTENT_DIRTY);
3278         if (ret) {
3279                 btrfs_set_log_full_commit(trans);
3280                 mutex_unlock(&log_root_tree->log_mutex);
3281                 goto out_wake_log_root;
3282         }
3283
3284         log_root_start = log_root_tree->node->start;
3285         log_root_level = btrfs_header_level(log_root_tree->node);
3286         log_root_tree->log_transid++;
3287         mutex_unlock(&log_root_tree->log_mutex);
3288
3289         /*
3290          * Here we are guaranteed that nobody is going to write the superblock
3291          * for the current transaction before us and that neither we do write
3292          * our superblock before the previous transaction finishes its commit
3293          * and writes its superblock, because:
3294          *
3295          * 1) We are holding a handle on the current transaction, so no body
3296          *    can commit it until we release the handle;
3297          *
3298          * 2) Before writing our superblock we acquire the tree_log_mutex, so
3299          *    if the previous transaction is still committing, and hasn't yet
3300          *    written its superblock, we wait for it to do it, because a
3301          *    transaction commit acquires the tree_log_mutex when the commit
3302          *    begins and releases it only after writing its superblock.
3303          */
3304         mutex_lock(&fs_info->tree_log_mutex);
3305
3306         /*
3307          * The previous transaction writeout phase could have failed, and thus
3308          * marked the fs in an error state.  We must not commit here, as we
3309          * could have updated our generation in the super_for_commit and
3310          * writing the super here would result in transid mismatches.  If there
3311          * is an error here just bail.
3312          */
3313         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3314                 ret = -EIO;
3315                 btrfs_set_log_full_commit(trans);
3316                 btrfs_abort_transaction(trans, ret);
3317                 mutex_unlock(&fs_info->tree_log_mutex);
3318                 goto out_wake_log_root;
3319         }
3320
3321         btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3322         btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3323         ret = write_all_supers(fs_info, 1);
3324         mutex_unlock(&fs_info->tree_log_mutex);
3325         if (ret) {
3326                 btrfs_set_log_full_commit(trans);
3327                 btrfs_abort_transaction(trans, ret);
3328                 goto out_wake_log_root;
3329         }
3330
3331         /*
3332          * We know there can only be one task here, since we have not yet set
3333          * root->log_commit[index1] to 0 and any task attempting to sync the
3334          * log must wait for the previous log transaction to commit if it's
3335          * still in progress or wait for the current log transaction commit if
3336          * someone else already started it. We use <= and not < because the
3337          * first log transaction has an ID of 0.
3338          */
3339         ASSERT(root->last_log_commit <= log_transid);
3340         root->last_log_commit = log_transid;
3341
3342 out_wake_log_root:
3343         mutex_lock(&log_root_tree->log_mutex);
3344         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3345
3346         log_root_tree->log_transid_committed++;
3347         atomic_set(&log_root_tree->log_commit[index2], 0);
3348         mutex_unlock(&log_root_tree->log_mutex);
3349
3350         /*
3351          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3352          * all the updates above are seen by the woken threads. It might not be
3353          * necessary, but proving that seems to be hard.
3354          */
3355         cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3356 out:
3357         mutex_lock(&root->log_mutex);
3358         btrfs_remove_all_log_ctxs(root, index1, ret);
3359         root->log_transid_committed++;
3360         atomic_set(&root->log_commit[index1], 0);
3361         mutex_unlock(&root->log_mutex);
3362
3363         /*
3364          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3365          * all the updates above are seen by the woken threads. It might not be
3366          * necessary, but proving that seems to be hard.
3367          */
3368         cond_wake_up(&root->log_commit_wait[index1]);
3369         return ret;
3370 }
3371
3372 static void free_log_tree(struct btrfs_trans_handle *trans,
3373                           struct btrfs_root *log)
3374 {
3375         int ret;
3376         struct walk_control wc = {
3377                 .free = 1,
3378                 .process_func = process_one_buffer
3379         };
3380
3381         if (log->node) {
3382                 ret = walk_log_tree(trans, log, &wc);
3383                 if (ret) {
3384                         if (trans)
3385                                 btrfs_abort_transaction(trans, ret);
3386                         else
3387                                 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3388                 }
3389         }
3390
3391         clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3392                           EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3393         extent_io_tree_release(&log->log_csum_range);
3394
3395         if (trans && log->node)
3396                 btrfs_redirty_list_add(trans->transaction, log->node);
3397         btrfs_put_root(log);
3398 }
3399
3400 /*
3401  * free all the extents used by the tree log.  This should be called
3402  * at commit time of the full transaction
3403  */
3404 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3405 {
3406         if (root->log_root) {
3407                 free_log_tree(trans, root->log_root);
3408                 root->log_root = NULL;
3409                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3410         }
3411         return 0;
3412 }
3413
3414 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3415                              struct btrfs_fs_info *fs_info)
3416 {
3417         if (fs_info->log_root_tree) {
3418                 free_log_tree(trans, fs_info->log_root_tree);
3419                 fs_info->log_root_tree = NULL;
3420                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3421         }
3422         return 0;
3423 }
3424
3425 /*
3426  * Check if an inode was logged in the current transaction. This may often
3427  * return some false positives, because logged_trans is an in memory only field,
3428  * not persisted anywhere. This is meant to be used in contexts where a false
3429  * positive has no functional consequences.
3430  */
3431 static bool inode_logged(struct btrfs_trans_handle *trans,
3432                          struct btrfs_inode *inode)
3433 {
3434         if (inode->logged_trans == trans->transid)
3435                 return true;
3436
3437         /*
3438          * The inode's logged_trans is always 0 when we load it (because it is
3439          * not persisted in the inode item or elsewhere). So if it is 0, the
3440          * inode was last modified in the current transaction then the inode may
3441          * have been logged before in the current transaction, then evicted and
3442          * loaded again in the current transaction - or may have never been logged
3443          * in the current transaction, but since we can not be sure, we have to
3444          * assume it was, otherwise our callers can leave an inconsistent log.
3445          */
3446         if (inode->logged_trans == 0 &&
3447             inode->last_trans == trans->transid &&
3448             !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3449                 return true;
3450
3451         return false;
3452 }
3453
3454 /*
3455  * If both a file and directory are logged, and unlinks or renames are
3456  * mixed in, we have a few interesting corners:
3457  *
3458  * create file X in dir Y
3459  * link file X to X.link in dir Y
3460  * fsync file X
3461  * unlink file X but leave X.link
3462  * fsync dir Y
3463  *
3464  * After a crash we would expect only X.link to exist.  But file X
3465  * didn't get fsync'd again so the log has back refs for X and X.link.
3466  *
3467  * We solve this by removing directory entries and inode backrefs from the
3468  * log when a file that was logged in the current transaction is
3469  * unlinked.  Any later fsync will include the updated log entries, and
3470  * we'll be able to reconstruct the proper directory items from backrefs.
3471  *
3472  * This optimizations allows us to avoid relogging the entire inode
3473  * or the entire directory.
3474  */
3475 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3476                                  struct btrfs_root *root,
3477                                  const char *name, int name_len,
3478                                  struct btrfs_inode *dir, u64 index)
3479 {
3480         struct btrfs_root *log;
3481         struct btrfs_dir_item *di;
3482         struct btrfs_path *path;
3483         int ret;
3484         int err = 0;
3485         u64 dir_ino = btrfs_ino(dir);
3486
3487         if (!inode_logged(trans, dir))
3488                 return 0;
3489
3490         ret = join_running_log_trans(root);
3491         if (ret)
3492                 return 0;
3493
3494         mutex_lock(&dir->log_mutex);
3495
3496         log = root->log_root;
3497         path = btrfs_alloc_path();
3498         if (!path) {
3499                 err = -ENOMEM;
3500                 goto out_unlock;
3501         }
3502
3503         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3504                                    name, name_len, -1);
3505         if (IS_ERR(di)) {
3506                 err = PTR_ERR(di);
3507                 goto fail;
3508         }
3509         if (di) {
3510                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3511                 if (ret) {
3512                         err = ret;
3513                         goto fail;
3514                 }
3515         }
3516         btrfs_release_path(path);
3517         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3518                                          index, name, name_len, -1);
3519         if (IS_ERR(di)) {
3520                 err = PTR_ERR(di);
3521                 goto fail;
3522         }
3523         if (di) {
3524                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3525                 if (ret) {
3526                         err = ret;
3527                         goto fail;
3528                 }
3529         }
3530
3531         /*
3532          * We do not need to update the size field of the directory's inode item
3533          * because on log replay we update the field to reflect all existing
3534          * entries in the directory (see overwrite_item()).
3535          */
3536 fail:
3537         btrfs_free_path(path);
3538 out_unlock:
3539         mutex_unlock(&dir->log_mutex);
3540         if (err == -ENOSPC) {
3541                 btrfs_set_log_full_commit(trans);
3542                 err = 0;
3543         } else if (err < 0 && err != -ENOENT) {
3544                 /* ENOENT can be returned if the entry hasn't been fsynced yet */
3545                 btrfs_abort_transaction(trans, err);
3546         }
3547
3548         btrfs_end_log_trans(root);
3549
3550         return err;
3551 }
3552
3553 /* see comments for btrfs_del_dir_entries_in_log */
3554 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3555                                struct btrfs_root *root,
3556                                const char *name, int name_len,
3557                                struct btrfs_inode *inode, u64 dirid)
3558 {
3559         struct btrfs_root *log;
3560         u64 index;
3561         int ret;
3562
3563         if (!inode_logged(trans, inode))
3564                 return 0;
3565
3566         ret = join_running_log_trans(root);
3567         if (ret)
3568                 return 0;
3569         log = root->log_root;
3570         mutex_lock(&inode->log_mutex);
3571
3572         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3573                                   dirid, &index);
3574         mutex_unlock(&inode->log_mutex);
3575         if (ret == -ENOSPC) {
3576                 btrfs_set_log_full_commit(trans);
3577                 ret = 0;
3578         } else if (ret < 0 && ret != -ENOENT)
3579                 btrfs_abort_transaction(trans, ret);
3580         btrfs_end_log_trans(root);
3581
3582         return ret;
3583 }
3584
3585 /*
3586  * creates a range item in the log for 'dirid'.  first_offset and
3587  * last_offset tell us which parts of the key space the log should
3588  * be considered authoritative for.
3589  */
3590 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3591                                        struct btrfs_root *log,
3592                                        struct btrfs_path *path,
3593                                        int key_type, u64 dirid,
3594                                        u64 first_offset, u64 last_offset)
3595 {
3596         int ret;
3597         struct btrfs_key key;
3598         struct btrfs_dir_log_item *item;
3599
3600         key.objectid = dirid;
3601         key.offset = first_offset;
3602         if (key_type == BTRFS_DIR_ITEM_KEY)
3603                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3604         else
3605                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3606         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3607         if (ret)
3608                 return ret;
3609
3610         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3611                               struct btrfs_dir_log_item);
3612         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3613         btrfs_mark_buffer_dirty(path->nodes[0]);
3614         btrfs_release_path(path);
3615         return 0;
3616 }
3617
3618 /*
3619  * log all the items included in the current transaction for a given
3620  * directory.  This also creates the range items in the log tree required
3621  * to replay anything deleted before the fsync
3622  */
3623 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3624                           struct btrfs_root *root, struct btrfs_inode *inode,
3625                           struct btrfs_path *path,
3626                           struct btrfs_path *dst_path, int key_type,
3627                           struct btrfs_log_ctx *ctx,
3628                           u64 min_offset, u64 *last_offset_ret)
3629 {
3630         struct btrfs_key min_key;
3631         struct btrfs_root *log = root->log_root;
3632         struct extent_buffer *src;
3633         int err = 0;
3634         int ret;
3635         int i;
3636         int nritems;
3637         u64 first_offset = min_offset;
3638         u64 last_offset = (u64)-1;
3639         u64 ino = btrfs_ino(inode);
3640
3641         log = root->log_root;
3642
3643         min_key.objectid = ino;
3644         min_key.type = key_type;
3645         min_key.offset = min_offset;
3646
3647         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3648
3649         /*
3650          * we didn't find anything from this transaction, see if there
3651          * is anything at all
3652          */
3653         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3654                 min_key.objectid = ino;
3655                 min_key.type = key_type;
3656                 min_key.offset = (u64)-1;
3657                 btrfs_release_path(path);
3658                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3659                 if (ret < 0) {
3660                         btrfs_release_path(path);
3661                         return ret;
3662                 }
3663                 ret = btrfs_previous_item(root, path, ino, key_type);
3664
3665                 /* if ret == 0 there are items for this type,
3666                  * create a range to tell us the last key of this type.
3667                  * otherwise, there are no items in this directory after
3668                  * *min_offset, and we create a range to indicate that.
3669                  */
3670                 if (ret == 0) {
3671                         struct btrfs_key tmp;
3672                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3673                                               path->slots[0]);
3674                         if (key_type == tmp.type)
3675                                 first_offset = max(min_offset, tmp.offset) + 1;
3676                 }
3677                 goto done;
3678         }
3679
3680         /* go backward to find any previous key */
3681         ret = btrfs_previous_item(root, path, ino, key_type);
3682         if (ret == 0) {
3683                 struct btrfs_key tmp;
3684                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3685                 if (key_type == tmp.type) {
3686                         first_offset = tmp.offset;
3687                         ret = overwrite_item(trans, log, dst_path,
3688                                              path->nodes[0], path->slots[0],
3689                                              &tmp);
3690                         if (ret) {
3691                                 err = ret;
3692                                 goto done;
3693                         }
3694                 }
3695         }
3696         btrfs_release_path(path);
3697
3698         /*
3699          * Find the first key from this transaction again.  See the note for
3700          * log_new_dir_dentries, if we're logging a directory recursively we
3701          * won't be holding its i_mutex, which means we can modify the directory
3702          * while we're logging it.  If we remove an entry between our first
3703          * search and this search we'll not find the key again and can just
3704          * bail.
3705          */
3706 search:
3707         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3708         if (ret != 0)
3709                 goto done;
3710
3711         /*
3712          * we have a block from this transaction, log every item in it
3713          * from our directory
3714          */
3715         while (1) {
3716                 struct btrfs_key tmp;
3717                 src = path->nodes[0];
3718                 nritems = btrfs_header_nritems(src);
3719                 for (i = path->slots[0]; i < nritems; i++) {
3720                         struct btrfs_dir_item *di;
3721
3722                         btrfs_item_key_to_cpu(src, &min_key, i);
3723
3724                         if (min_key.objectid != ino || min_key.type != key_type)
3725                                 goto done;
3726
3727                         if (need_resched()) {
3728                                 btrfs_release_path(path);
3729                                 cond_resched();
3730                                 goto search;
3731                         }
3732
3733                         ret = overwrite_item(trans, log, dst_path, src, i,
3734                                              &min_key);
3735                         if (ret) {
3736                                 err = ret;
3737                                 goto done;
3738                         }
3739
3740                         /*
3741                          * We must make sure that when we log a directory entry,
3742                          * the corresponding inode, after log replay, has a
3743                          * matching link count. For example:
3744                          *
3745                          * touch foo
3746                          * mkdir mydir
3747                          * sync
3748                          * ln foo mydir/bar
3749                          * xfs_io -c "fsync" mydir
3750                          * <crash>
3751                          * <mount fs and log replay>
3752                          *
3753                          * Would result in a fsync log that when replayed, our
3754                          * file inode would have a link count of 1, but we get
3755                          * two directory entries pointing to the same inode.
3756                          * After removing one of the names, it would not be
3757                          * possible to remove the other name, which resulted
3758                          * always in stale file handle errors, and would not
3759                          * be possible to rmdir the parent directory, since
3760                          * its i_size could never decrement to the value
3761                          * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3762                          */
3763                         di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3764                         btrfs_dir_item_key_to_cpu(src, di, &tmp);
3765                         if (ctx &&
3766                             (btrfs_dir_transid(src, di) == trans->transid ||
3767                              btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3768                             tmp.type != BTRFS_ROOT_ITEM_KEY)
3769                                 ctx->log_new_dentries = true;
3770                 }
3771                 path->slots[0] = nritems;
3772
3773                 /*
3774                  * look ahead to the next item and see if it is also
3775                  * from this directory and from this transaction
3776                  */
3777                 ret = btrfs_next_leaf(root, path);
3778                 if (ret) {
3779                         if (ret == 1)
3780                                 last_offset = (u64)-1;
3781                         else
3782                                 err = ret;
3783                         goto done;
3784                 }
3785                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3786                 if (tmp.objectid != ino || tmp.type != key_type) {
3787                         last_offset = (u64)-1;
3788                         goto done;
3789                 }
3790                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3791                         ret = overwrite_item(trans, log, dst_path,
3792                                              path->nodes[0], path->slots[0],
3793                                              &tmp);
3794                         if (ret)
3795                                 err = ret;
3796                         else
3797                                 last_offset = tmp.offset;
3798                         goto done;
3799                 }
3800         }
3801 done:
3802         btrfs_release_path(path);
3803         btrfs_release_path(dst_path);
3804
3805         if (err == 0) {
3806                 *last_offset_ret = last_offset;
3807                 /*
3808                  * insert the log range keys to indicate where the log
3809                  * is valid
3810                  */
3811                 ret = insert_dir_log_key(trans, log, path, key_type,
3812                                          ino, first_offset, last_offset);
3813                 if (ret)
3814                         err = ret;
3815         }
3816         return err;
3817 }
3818
3819 /*
3820  * logging directories is very similar to logging inodes, We find all the items
3821  * from the current transaction and write them to the log.
3822  *
3823  * The recovery code scans the directory in the subvolume, and if it finds a
3824  * key in the range logged that is not present in the log tree, then it means
3825  * that dir entry was unlinked during the transaction.
3826  *
3827  * In order for that scan to work, we must include one key smaller than
3828  * the smallest logged by this transaction and one key larger than the largest
3829  * key logged by this transaction.
3830  */
3831 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3832                           struct btrfs_root *root, struct btrfs_inode *inode,
3833                           struct btrfs_path *path,
3834                           struct btrfs_path *dst_path,
3835                           struct btrfs_log_ctx *ctx)
3836 {
3837         u64 min_key;
3838         u64 max_key;
3839         int ret;
3840         int key_type = BTRFS_DIR_ITEM_KEY;
3841
3842 again:
3843         min_key = 0;
3844         max_key = 0;
3845         while (1) {
3846                 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3847                                 ctx, min_key, &max_key);
3848                 if (ret)
3849                         return ret;
3850                 if (max_key == (u64)-1)
3851                         break;
3852                 min_key = max_key + 1;
3853         }
3854
3855         if (key_type == BTRFS_DIR_ITEM_KEY) {
3856                 key_type = BTRFS_DIR_INDEX_KEY;
3857                 goto again;
3858         }
3859         return 0;
3860 }
3861
3862 /*
3863  * a helper function to drop items from the log before we relog an
3864  * inode.  max_key_type indicates the highest item type to remove.
3865  * This cannot be run for file data extents because it does not
3866  * free the extents they point to.
3867  */
3868 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3869                                   struct btrfs_root *log,
3870                                   struct btrfs_path *path,
3871                                   u64 objectid, int max_key_type)
3872 {
3873         int ret;
3874         struct btrfs_key key;
3875         struct btrfs_key found_key;
3876         int start_slot;
3877
3878         key.objectid = objectid;
3879         key.type = max_key_type;
3880         key.offset = (u64)-1;
3881
3882         while (1) {
3883                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3884                 BUG_ON(ret == 0); /* Logic error */
3885                 if (ret < 0)
3886                         break;
3887
3888                 if (path->slots[0] == 0)
3889                         break;
3890
3891                 path->slots[0]--;
3892                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3893                                       path->slots[0]);
3894
3895                 if (found_key.objectid != objectid)
3896                         break;
3897
3898                 found_key.offset = 0;
3899                 found_key.type = 0;
3900                 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3901                 if (ret < 0)
3902                         break;
3903
3904                 ret = btrfs_del_items(trans, log, path, start_slot,
3905                                       path->slots[0] - start_slot + 1);
3906                 /*
3907                  * If start slot isn't 0 then we don't need to re-search, we've
3908                  * found the last guy with the objectid in this tree.
3909                  */
3910                 if (ret || start_slot != 0)
3911                         break;
3912                 btrfs_release_path(path);
3913         }
3914         btrfs_release_path(path);
3915         if (ret > 0)
3916                 ret = 0;
3917         return ret;
3918 }
3919
3920 static void fill_inode_item(struct btrfs_trans_handle *trans,
3921                             struct extent_buffer *leaf,
3922                             struct btrfs_inode_item *item,
3923                             struct inode *inode, int log_inode_only,
3924                             u64 logged_isize)
3925 {
3926         struct btrfs_map_token token;
3927         u64 flags;
3928
3929         btrfs_init_map_token(&token, leaf);
3930
3931         if (log_inode_only) {
3932                 /* set the generation to zero so the recover code
3933                  * can tell the difference between an logging
3934                  * just to say 'this inode exists' and a logging
3935                  * to say 'update this inode with these values'
3936                  */
3937                 btrfs_set_token_inode_generation(&token, item, 0);
3938                 btrfs_set_token_inode_size(&token, item, logged_isize);
3939         } else {
3940                 btrfs_set_token_inode_generation(&token, item,
3941                                                  BTRFS_I(inode)->generation);
3942                 btrfs_set_token_inode_size(&token, item, inode->i_size);
3943         }
3944
3945         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3946         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3947         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3948         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3949
3950         btrfs_set_token_timespec_sec(&token, &item->atime,
3951                                      inode->i_atime.tv_sec);
3952         btrfs_set_token_timespec_nsec(&token, &item->atime,
3953                                       inode->i_atime.tv_nsec);
3954
3955         btrfs_set_token_timespec_sec(&token, &item->mtime,
3956                                      inode->i_mtime.tv_sec);
3957         btrfs_set_token_timespec_nsec(&token, &item->mtime,
3958                                       inode->i_mtime.tv_nsec);
3959
3960         btrfs_set_token_timespec_sec(&token, &item->ctime,
3961                                      inode->i_ctime.tv_sec);
3962         btrfs_set_token_timespec_nsec(&token, &item->ctime,
3963                                       inode->i_ctime.tv_nsec);
3964
3965         /*
3966          * We do not need to set the nbytes field, in fact during a fast fsync
3967          * its value may not even be correct, since a fast fsync does not wait
3968          * for ordered extent completion, which is where we update nbytes, it
3969          * only waits for writeback to complete. During log replay as we find
3970          * file extent items and replay them, we adjust the nbytes field of the
3971          * inode item in subvolume tree as needed (see overwrite_item()).
3972          */
3973
3974         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3975         btrfs_set_token_inode_transid(&token, item, trans->transid);
3976         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3977         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3978                                           BTRFS_I(inode)->ro_flags);
3979         btrfs_set_token_inode_flags(&token, item, flags);
3980         btrfs_set_token_inode_block_group(&token, item, 0);
3981 }
3982
3983 static int log_inode_item(struct btrfs_trans_handle *trans,
3984                           struct btrfs_root *log, struct btrfs_path *path,
3985                           struct btrfs_inode *inode, bool inode_item_dropped)
3986 {
3987         struct btrfs_inode_item *inode_item;
3988         int ret;
3989
3990         /*
3991          * If we are doing a fast fsync and the inode was logged before in the
3992          * current transaction, then we know the inode was previously logged and
3993          * it exists in the log tree. For performance reasons, in this case use
3994          * btrfs_search_slot() directly with ins_len set to 0 so that we never
3995          * attempt a write lock on the leaf's parent, which adds unnecessary lock
3996          * contention in case there are concurrent fsyncs for other inodes of the
3997          * same subvolume. Using btrfs_insert_empty_item() when the inode item
3998          * already exists can also result in unnecessarily splitting a leaf.
3999          */
4000         if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4001                 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4002                 ASSERT(ret <= 0);
4003                 if (ret > 0)
4004                         ret = -ENOENT;
4005         } else {
4006                 /*
4007                  * This means it is the first fsync in the current transaction,
4008                  * so the inode item is not in the log and we need to insert it.
4009                  * We can never get -EEXIST because we are only called for a fast
4010                  * fsync and in case an inode eviction happens after the inode was
4011                  * logged before in the current transaction, when we load again
4012                  * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4013                  * flags and set ->logged_trans to 0.
4014                  */
4015                 ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4016                                               sizeof(*inode_item));
4017                 ASSERT(ret != -EEXIST);
4018         }
4019         if (ret)
4020                 return ret;
4021         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4022                                     struct btrfs_inode_item);
4023         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4024                         0, 0);
4025         btrfs_release_path(path);
4026         return 0;
4027 }
4028
4029 static int log_csums(struct btrfs_trans_handle *trans,
4030                      struct btrfs_inode *inode,
4031                      struct btrfs_root *log_root,
4032                      struct btrfs_ordered_sum *sums)
4033 {
4034         const u64 lock_end = sums->bytenr + sums->len - 1;
4035         struct extent_state *cached_state = NULL;
4036         int ret;
4037
4038         /*
4039          * If this inode was not used for reflink operations in the current
4040          * transaction with new extents, then do the fast path, no need to
4041          * worry about logging checksum items with overlapping ranges.
4042          */
4043         if (inode->last_reflink_trans < trans->transid)
4044                 return btrfs_csum_file_blocks(trans, log_root, sums);
4045
4046         /*
4047          * Serialize logging for checksums. This is to avoid racing with the
4048          * same checksum being logged by another task that is logging another
4049          * file which happens to refer to the same extent as well. Such races
4050          * can leave checksum items in the log with overlapping ranges.
4051          */
4052         ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
4053                                lock_end, &cached_state);
4054         if (ret)
4055                 return ret;
4056         /*
4057          * Due to extent cloning, we might have logged a csum item that covers a
4058          * subrange of a cloned extent, and later we can end up logging a csum
4059          * item for a larger subrange of the same extent or the entire range.
4060          * This would leave csum items in the log tree that cover the same range
4061          * and break the searches for checksums in the log tree, resulting in
4062          * some checksums missing in the fs/subvolume tree. So just delete (or
4063          * trim and adjust) any existing csum items in the log for this range.
4064          */
4065         ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4066         if (!ret)
4067                 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4068
4069         unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4070                              &cached_state);
4071
4072         return ret;
4073 }
4074
4075 static noinline int copy_items(struct btrfs_trans_handle *trans,
4076                                struct btrfs_inode *inode,
4077                                struct btrfs_path *dst_path,
4078                                struct btrfs_path *src_path,
4079                                int start_slot, int nr, int inode_only,
4080                                u64 logged_isize)
4081 {
4082         struct btrfs_fs_info *fs_info = trans->fs_info;
4083         unsigned long src_offset;
4084         unsigned long dst_offset;
4085         struct btrfs_root *log = inode->root->log_root;
4086         struct btrfs_file_extent_item *extent;
4087         struct btrfs_inode_item *inode_item;
4088         struct extent_buffer *src = src_path->nodes[0];
4089         int ret;
4090         struct btrfs_key *ins_keys;
4091         u32 *ins_sizes;
4092         char *ins_data;
4093         int i;
4094         struct list_head ordered_sums;
4095         int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4096
4097         INIT_LIST_HEAD(&ordered_sums);
4098
4099         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4100                            nr * sizeof(u32), GFP_NOFS);
4101         if (!ins_data)
4102                 return -ENOMEM;
4103
4104         ins_sizes = (u32 *)ins_data;
4105         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4106
4107         for (i = 0; i < nr; i++) {
4108                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4109                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4110         }
4111         ret = btrfs_insert_empty_items(trans, log, dst_path,
4112                                        ins_keys, ins_sizes, nr);
4113         if (ret) {
4114                 kfree(ins_data);
4115                 return ret;
4116         }
4117
4118         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4119                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4120                                                    dst_path->slots[0]);
4121
4122                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4123
4124                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4125                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
4126                                                     dst_path->slots[0],
4127                                                     struct btrfs_inode_item);
4128                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
4129                                         &inode->vfs_inode,
4130                                         inode_only == LOG_INODE_EXISTS,
4131                                         logged_isize);
4132                 } else {
4133                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4134                                            src_offset, ins_sizes[i]);
4135                 }
4136
4137                 /* take a reference on file data extents so that truncates
4138                  * or deletes of this inode don't have to relog the inode
4139                  * again
4140                  */
4141                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4142                     !skip_csum) {
4143                         int found_type;
4144                         extent = btrfs_item_ptr(src, start_slot + i,
4145                                                 struct btrfs_file_extent_item);
4146
4147                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
4148                                 continue;
4149
4150                         found_type = btrfs_file_extent_type(src, extent);
4151                         if (found_type == BTRFS_FILE_EXTENT_REG) {
4152                                 u64 ds, dl, cs, cl;
4153                                 ds = btrfs_file_extent_disk_bytenr(src,
4154                                                                 extent);
4155                                 /* ds == 0 is a hole */
4156                                 if (ds == 0)
4157                                         continue;
4158
4159                                 dl = btrfs_file_extent_disk_num_bytes(src,
4160                                                                 extent);
4161                                 cs = btrfs_file_extent_offset(src, extent);
4162                                 cl = btrfs_file_extent_num_bytes(src,
4163                                                                 extent);
4164                                 if (btrfs_file_extent_compression(src,
4165                                                                   extent)) {
4166                                         cs = 0;
4167                                         cl = dl;
4168                                 }
4169
4170                                 ret = btrfs_lookup_csums_range(
4171                                                 fs_info->csum_root,
4172                                                 ds + cs, ds + cs + cl - 1,
4173                                                 &ordered_sums, 0);
4174                                 if (ret)
4175                                         break;
4176                         }
4177                 }
4178         }
4179
4180         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4181         btrfs_release_path(dst_path);
4182         kfree(ins_data);
4183
4184         /*
4185          * we have to do this after the loop above to avoid changing the
4186          * log tree while trying to change the log tree.
4187          */
4188         while (!list_empty(&ordered_sums)) {
4189                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4190                                                    struct btrfs_ordered_sum,
4191                                                    list);
4192                 if (!ret)
4193                         ret = log_csums(trans, inode, log, sums);
4194                 list_del(&sums->list);
4195                 kfree(sums);
4196         }
4197
4198         return ret;
4199 }
4200
4201 static int extent_cmp(void *priv, const struct list_head *a,
4202                       const struct list_head *b)
4203 {
4204         const struct extent_map *em1, *em2;
4205
4206         em1 = list_entry(a, struct extent_map, list);
4207         em2 = list_entry(b, struct extent_map, list);
4208
4209         if (em1->start < em2->start)
4210                 return -1;
4211         else if (em1->start > em2->start)
4212                 return 1;
4213         return 0;
4214 }
4215
4216 static int log_extent_csums(struct btrfs_trans_handle *trans,
4217                             struct btrfs_inode *inode,
4218                             struct btrfs_root *log_root,
4219                             const struct extent_map *em,
4220                             struct btrfs_log_ctx *ctx)
4221 {
4222         struct btrfs_ordered_extent *ordered;
4223         u64 csum_offset;
4224         u64 csum_len;
4225         u64 mod_start = em->mod_start;
4226         u64 mod_len = em->mod_len;
4227         LIST_HEAD(ordered_sums);
4228         int ret = 0;
4229
4230         if (inode->flags & BTRFS_INODE_NODATASUM ||
4231             test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4232             em->block_start == EXTENT_MAP_HOLE)
4233                 return 0;
4234
4235         list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4236                 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4237                 const u64 mod_end = mod_start + mod_len;
4238                 struct btrfs_ordered_sum *sums;
4239
4240                 if (mod_len == 0)
4241                         break;
4242
4243                 if (ordered_end <= mod_start)
4244                         continue;
4245                 if (mod_end <= ordered->file_offset)
4246                         break;
4247
4248                 /*
4249                  * We are going to copy all the csums on this ordered extent, so
4250                  * go ahead and adjust mod_start and mod_len in case this ordered
4251                  * extent has already been logged.
4252                  */
4253                 if (ordered->file_offset > mod_start) {
4254                         if (ordered_end >= mod_end)
4255                                 mod_len = ordered->file_offset - mod_start;
4256                         /*
4257                          * If we have this case
4258                          *
4259                          * |--------- logged extent ---------|
4260                          *       |----- ordered extent ----|
4261                          *
4262                          * Just don't mess with mod_start and mod_len, we'll
4263                          * just end up logging more csums than we need and it
4264                          * will be ok.
4265                          */
4266                 } else {
4267                         if (ordered_end < mod_end) {
4268                                 mod_len = mod_end - ordered_end;
4269                                 mod_start = ordered_end;
4270                         } else {
4271                                 mod_len = 0;
4272                         }
4273                 }
4274
4275                 /*
4276                  * To keep us from looping for the above case of an ordered
4277                  * extent that falls inside of the logged extent.
4278                  */
4279                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4280                         continue;
4281
4282                 list_for_each_entry(sums, &ordered->list, list) {
4283                         ret = log_csums(trans, inode, log_root, sums);
4284                         if (ret)
4285                                 return ret;
4286                 }
4287         }
4288
4289         /* We're done, found all csums in the ordered extents. */
4290         if (mod_len == 0)
4291                 return 0;
4292
4293         /* If we're compressed we have to save the entire range of csums. */
4294         if (em->compress_type) {
4295                 csum_offset = 0;
4296                 csum_len = max(em->block_len, em->orig_block_len);
4297         } else {
4298                 csum_offset = mod_start - em->start;
4299                 csum_len = mod_len;
4300         }
4301
4302         /* block start is already adjusted for the file extent offset. */
4303         ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4304                                        em->block_start + csum_offset,
4305                                        em->block_start + csum_offset +
4306                                        csum_len - 1, &ordered_sums, 0);
4307         if (ret)
4308                 return ret;
4309
4310         while (!list_empty(&ordered_sums)) {
4311                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4312                                                    struct btrfs_ordered_sum,
4313                                                    list);
4314                 if (!ret)
4315                         ret = log_csums(trans, inode, log_root, sums);
4316                 list_del(&sums->list);
4317                 kfree(sums);
4318         }
4319
4320         return ret;
4321 }
4322
4323 static int log_one_extent(struct btrfs_trans_handle *trans,
4324                           struct btrfs_inode *inode, struct btrfs_root *root,
4325                           const struct extent_map *em,
4326                           struct btrfs_path *path,
4327                           struct btrfs_log_ctx *ctx)
4328 {
4329         struct btrfs_drop_extents_args drop_args = { 0 };
4330         struct btrfs_root *log = root->log_root;
4331         struct btrfs_file_extent_item *fi;
4332         struct extent_buffer *leaf;
4333         struct btrfs_map_token token;
4334         struct btrfs_key key;
4335         u64 extent_offset = em->start - em->orig_start;
4336         u64 block_len;
4337         int ret;
4338
4339         ret = log_extent_csums(trans, inode, log, em, ctx);
4340         if (ret)
4341                 return ret;
4342
4343         drop_args.path = path;
4344         drop_args.start = em->start;
4345         drop_args.end = em->start + em->len;
4346         drop_args.replace_extent = true;
4347         drop_args.extent_item_size = sizeof(*fi);
4348         ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4349         if (ret)
4350                 return ret;
4351
4352         if (!drop_args.extent_inserted) {
4353                 key.objectid = btrfs_ino(inode);
4354                 key.type = BTRFS_EXTENT_DATA_KEY;
4355                 key.offset = em->start;
4356
4357                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4358                                               sizeof(*fi));
4359                 if (ret)
4360                         return ret;
4361         }
4362         leaf = path->nodes[0];
4363         btrfs_init_map_token(&token, leaf);
4364         fi = btrfs_item_ptr(leaf, path->slots[0],
4365                             struct btrfs_file_extent_item);
4366
4367         btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4368         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4369                 btrfs_set_token_file_extent_type(&token, fi,
4370                                                  BTRFS_FILE_EXTENT_PREALLOC);
4371         else
4372                 btrfs_set_token_file_extent_type(&token, fi,
4373                                                  BTRFS_FILE_EXTENT_REG);
4374
4375         block_len = max(em->block_len, em->orig_block_len);
4376         if (em->compress_type != BTRFS_COMPRESS_NONE) {
4377                 btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4378                                                         em->block_start);
4379                 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4380         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4381                 btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4382                                                         em->block_start -
4383                                                         extent_offset);
4384                 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4385         } else {
4386                 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4387                 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4388         }
4389
4390         btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4391         btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4392         btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4393         btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4394         btrfs_set_token_file_extent_encryption(&token, fi, 0);
4395         btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4396         btrfs_mark_buffer_dirty(leaf);
4397
4398         btrfs_release_path(path);
4399
4400         return ret;
4401 }
4402
4403 /*
4404  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4405  * lose them after doing a fast fsync and replaying the log. We scan the
4406  * subvolume's root instead of iterating the inode's extent map tree because
4407  * otherwise we can log incorrect extent items based on extent map conversion.
4408  * That can happen due to the fact that extent maps are merged when they
4409  * are not in the extent map tree's list of modified extents.
4410  */
4411 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4412                                       struct btrfs_inode *inode,
4413                                       struct btrfs_path *path)
4414 {
4415         struct btrfs_root *root = inode->root;
4416         struct btrfs_key key;
4417         const u64 i_size = i_size_read(&inode->vfs_inode);
4418         const u64 ino = btrfs_ino(inode);
4419         struct btrfs_path *dst_path = NULL;
4420         bool dropped_extents = false;
4421         u64 truncate_offset = i_size;
4422         struct extent_buffer *leaf;
4423         int slot;
4424         int ins_nr = 0;
4425         int start_slot;
4426         int ret;
4427
4428         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4429                 return 0;
4430
4431         key.objectid = ino;
4432         key.type = BTRFS_EXTENT_DATA_KEY;
4433         key.offset = i_size;
4434         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4435         if (ret < 0)
4436                 goto out;
4437
4438         /*
4439          * We must check if there is a prealloc extent that starts before the
4440          * i_size and crosses the i_size boundary. This is to ensure later we
4441          * truncate down to the end of that extent and not to the i_size, as
4442          * otherwise we end up losing part of the prealloc extent after a log
4443          * replay and with an implicit hole if there is another prealloc extent
4444          * that starts at an offset beyond i_size.
4445          */
4446         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4447         if (ret < 0)
4448                 goto out;
4449
4450         if (ret == 0) {
4451                 struct btrfs_file_extent_item *ei;
4452
4453                 leaf = path->nodes[0];
4454                 slot = path->slots[0];
4455                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4456
4457                 if (btrfs_file_extent_type(leaf, ei) ==
4458                     BTRFS_FILE_EXTENT_PREALLOC) {
4459                         u64 extent_end;
4460
4461                         btrfs_item_key_to_cpu(leaf, &key, slot);
4462                         extent_end = key.offset +
4463                                 btrfs_file_extent_num_bytes(leaf, ei);
4464
4465                         if (extent_end > i_size)
4466                                 truncate_offset = extent_end;
4467                 }
4468         } else {
4469                 ret = 0;
4470         }
4471
4472         while (true) {
4473                 leaf = path->nodes[0];
4474                 slot = path->slots[0];
4475
4476                 if (slot >= btrfs_header_nritems(leaf)) {
4477                         if (ins_nr > 0) {
4478                                 ret = copy_items(trans, inode, dst_path, path,
4479                                                  start_slot, ins_nr, 1, 0);
4480                                 if (ret < 0)
4481                                         goto out;
4482                                 ins_nr = 0;
4483                         }
4484                         ret = btrfs_next_leaf(root, path);
4485                         if (ret < 0)
4486                                 goto out;
4487                         if (ret > 0) {
4488                                 ret = 0;
4489                                 break;
4490                         }
4491                         continue;
4492                 }
4493
4494                 btrfs_item_key_to_cpu(leaf, &key, slot);
4495                 if (key.objectid > ino)
4496                         break;
4497                 if (WARN_ON_ONCE(key.objectid < ino) ||
4498                     key.type < BTRFS_EXTENT_DATA_KEY ||
4499                     key.offset < i_size) {
4500                         path->slots[0]++;
4501                         continue;
4502                 }
4503                 if (!dropped_extents) {
4504                         /*
4505                          * Avoid logging extent items logged in past fsync calls
4506                          * and leading to duplicate keys in the log tree.
4507                          */
4508                         do {
4509                                 ret = btrfs_truncate_inode_items(trans,
4510                                                          root->log_root,
4511                                                          inode, truncate_offset,
4512                                                          BTRFS_EXTENT_DATA_KEY,
4513                                                          NULL);
4514                         } while (ret == -EAGAIN);
4515                         if (ret)
4516                                 goto out;
4517                         dropped_extents = true;
4518                 }
4519                 if (ins_nr == 0)
4520                         start_slot = slot;
4521                 ins_nr++;
4522                 path->slots[0]++;
4523                 if (!dst_path) {
4524                         dst_path = btrfs_alloc_path();
4525                         if (!dst_path) {
4526                                 ret = -ENOMEM;
4527                                 goto out;
4528                         }
4529                 }
4530         }
4531         if (ins_nr > 0)
4532                 ret = copy_items(trans, inode, dst_path, path,
4533                                  start_slot, ins_nr, 1, 0);
4534 out:
4535         btrfs_release_path(path);
4536         btrfs_free_path(dst_path);
4537         return ret;
4538 }
4539
4540 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4541                                      struct btrfs_root *root,
4542                                      struct btrfs_inode *inode,
4543                                      struct btrfs_path *path,
4544                                      struct btrfs_log_ctx *ctx)
4545 {
4546         struct btrfs_ordered_extent *ordered;
4547         struct btrfs_ordered_extent *tmp;
4548         struct extent_map *em, *n;
4549         struct list_head extents;
4550         struct extent_map_tree *tree = &inode->extent_tree;
4551         int ret = 0;
4552         int num = 0;
4553
4554         INIT_LIST_HEAD(&extents);
4555
4556         write_lock(&tree->lock);
4557
4558         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4559                 list_del_init(&em->list);
4560                 /*
4561                  * Just an arbitrary number, this can be really CPU intensive
4562                  * once we start getting a lot of extents, and really once we
4563                  * have a bunch of extents we just want to commit since it will
4564                  * be faster.
4565                  */
4566                 if (++num > 32768) {
4567                         list_del_init(&tree->modified_extents);
4568                         ret = -EFBIG;
4569                         goto process;
4570                 }
4571
4572                 if (em->generation < trans->transid)
4573                         continue;
4574
4575                 /* We log prealloc extents beyond eof later. */
4576                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4577                     em->start >= i_size_read(&inode->vfs_inode))
4578                         continue;
4579
4580                 /* Need a ref to keep it from getting evicted from cache */
4581                 refcount_inc(&em->refs);
4582                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4583                 list_add_tail(&em->list, &extents);
4584                 num++;
4585         }
4586
4587         list_sort(NULL, &extents, extent_cmp);
4588 process:
4589         while (!list_empty(&extents)) {
4590                 em = list_entry(extents.next, struct extent_map, list);
4591
4592                 list_del_init(&em->list);
4593
4594                 /*
4595                  * If we had an error we just need to delete everybody from our
4596                  * private list.
4597                  */
4598                 if (ret) {
4599                         clear_em_logging(tree, em);
4600                         free_extent_map(em);
4601                         continue;
4602                 }
4603
4604                 write_unlock(&tree->lock);
4605
4606                 ret = log_one_extent(trans, inode, root, em, path, ctx);
4607                 write_lock(&tree->lock);
4608                 clear_em_logging(tree, em);
4609                 free_extent_map(em);
4610         }
4611         WARN_ON(!list_empty(&extents));
4612         write_unlock(&tree->lock);
4613
4614         btrfs_release_path(path);
4615         if (!ret)
4616                 ret = btrfs_log_prealloc_extents(trans, inode, path);
4617         if (ret)
4618                 return ret;
4619
4620         /*
4621          * We have logged all extents successfully, now make sure the commit of
4622          * the current transaction waits for the ordered extents to complete
4623          * before it commits and wipes out the log trees, otherwise we would
4624          * lose data if an ordered extents completes after the transaction
4625          * commits and a power failure happens after the transaction commit.
4626          */
4627         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4628                 list_del_init(&ordered->log_list);
4629                 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4630
4631                 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4632                         spin_lock_irq(&inode->ordered_tree.lock);
4633                         if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4634                                 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4635                                 atomic_inc(&trans->transaction->pending_ordered);
4636                         }
4637                         spin_unlock_irq(&inode->ordered_tree.lock);
4638                 }
4639                 btrfs_put_ordered_extent(ordered);
4640         }
4641
4642         return 0;
4643 }
4644
4645 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4646                              struct btrfs_path *path, u64 *size_ret)
4647 {
4648         struct btrfs_key key;
4649         int ret;
4650
4651         key.objectid = btrfs_ino(inode);
4652         key.type = BTRFS_INODE_ITEM_KEY;
4653         key.offset = 0;
4654
4655         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4656         if (ret < 0) {
4657                 return ret;
4658         } else if (ret > 0) {
4659                 *size_ret = 0;
4660         } else {
4661                 struct btrfs_inode_item *item;
4662
4663                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4664                                       struct btrfs_inode_item);
4665                 *size_ret = btrfs_inode_size(path->nodes[0], item);
4666                 /*
4667                  * If the in-memory inode's i_size is smaller then the inode
4668                  * size stored in the btree, return the inode's i_size, so
4669                  * that we get a correct inode size after replaying the log
4670                  * when before a power failure we had a shrinking truncate
4671                  * followed by addition of a new name (rename / new hard link).
4672                  * Otherwise return the inode size from the btree, to avoid
4673                  * data loss when replaying a log due to previously doing a
4674                  * write that expands the inode's size and logging a new name
4675                  * immediately after.
4676                  */
4677                 if (*size_ret > inode->vfs_inode.i_size)
4678                         *size_ret = inode->vfs_inode.i_size;
4679         }
4680
4681         btrfs_release_path(path);
4682         return 0;
4683 }
4684
4685 /*
4686  * At the moment we always log all xattrs. This is to figure out at log replay
4687  * time which xattrs must have their deletion replayed. If a xattr is missing
4688  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4689  * because if a xattr is deleted, the inode is fsynced and a power failure
4690  * happens, causing the log to be replayed the next time the fs is mounted,
4691  * we want the xattr to not exist anymore (same behaviour as other filesystems
4692  * with a journal, ext3/4, xfs, f2fs, etc).
4693  */
4694 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4695                                 struct btrfs_root *root,
4696                                 struct btrfs_inode *inode,
4697                                 struct btrfs_path *path,
4698                                 struct btrfs_path *dst_path)
4699 {
4700         int ret;
4701         struct btrfs_key key;
4702         const u64 ino = btrfs_ino(inode);
4703         int ins_nr = 0;
4704         int start_slot = 0;
4705         bool found_xattrs = false;
4706
4707         if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4708                 return 0;
4709
4710         key.objectid = ino;
4711         key.type = BTRFS_XATTR_ITEM_KEY;
4712         key.offset = 0;
4713
4714         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4715         if (ret < 0)
4716                 return ret;
4717
4718         while (true) {
4719                 int slot = path->slots[0];
4720                 struct extent_buffer *leaf = path->nodes[0];
4721                 int nritems = btrfs_header_nritems(leaf);
4722
4723                 if (slot >= nritems) {
4724                         if (ins_nr > 0) {
4725                                 ret = copy_items(trans, inode, dst_path, path,
4726                                                  start_slot, ins_nr, 1, 0);
4727                                 if (ret < 0)
4728                                         return ret;
4729                                 ins_nr = 0;
4730                         }
4731                         ret = btrfs_next_leaf(root, path);
4732                         if (ret < 0)
4733                                 return ret;
4734                         else if (ret > 0)
4735                                 break;
4736                         continue;
4737                 }
4738
4739                 btrfs_item_key_to_cpu(leaf, &key, slot);
4740                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4741                         break;
4742
4743                 if (ins_nr == 0)
4744                         start_slot = slot;
4745                 ins_nr++;
4746                 path->slots[0]++;
4747                 found_xattrs = true;
4748                 cond_resched();
4749         }
4750         if (ins_nr > 0) {
4751                 ret = copy_items(trans, inode, dst_path, path,
4752                                  start_slot, ins_nr, 1, 0);
4753                 if (ret < 0)
4754                         return ret;
4755         }
4756
4757         if (!found_xattrs)
4758                 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4759
4760         return 0;
4761 }
4762
4763 /*
4764  * When using the NO_HOLES feature if we punched a hole that causes the
4765  * deletion of entire leafs or all the extent items of the first leaf (the one
4766  * that contains the inode item and references) we may end up not processing
4767  * any extents, because there are no leafs with a generation matching the
4768  * current transaction that have extent items for our inode. So we need to find
4769  * if any holes exist and then log them. We also need to log holes after any
4770  * truncate operation that changes the inode's size.
4771  */
4772 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4773                            struct btrfs_root *root,
4774                            struct btrfs_inode *inode,
4775                            struct btrfs_path *path)
4776 {
4777         struct btrfs_fs_info *fs_info = root->fs_info;
4778         struct btrfs_key key;
4779         const u64 ino = btrfs_ino(inode);
4780         const u64 i_size = i_size_read(&inode->vfs_inode);
4781         u64 prev_extent_end = 0;
4782         int ret;
4783
4784         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4785                 return 0;
4786
4787         key.objectid = ino;
4788         key.type = BTRFS_EXTENT_DATA_KEY;
4789         key.offset = 0;
4790
4791         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4792         if (ret < 0)
4793                 return ret;
4794
4795         while (true) {
4796                 struct extent_buffer *leaf = path->nodes[0];
4797
4798                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4799                         ret = btrfs_next_leaf(root, path);
4800                         if (ret < 0)
4801                                 return ret;
4802                         if (ret > 0) {
4803                                 ret = 0;
4804                                 break;
4805                         }
4806                         leaf = path->nodes[0];
4807                 }
4808
4809                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4810                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4811                         break;
4812
4813                 /* We have a hole, log it. */
4814                 if (prev_extent_end < key.offset) {
4815                         const u64 hole_len = key.offset - prev_extent_end;
4816
4817                         /*
4818                          * Release the path to avoid deadlocks with other code
4819                          * paths that search the root while holding locks on
4820                          * leafs from the log root.
4821                          */
4822                         btrfs_release_path(path);
4823                         ret = btrfs_insert_file_extent(trans, root->log_root,
4824                                                        ino, prev_extent_end, 0,
4825                                                        0, hole_len, 0, hole_len,
4826                                                        0, 0, 0);
4827                         if (ret < 0)
4828                                 return ret;
4829
4830                         /*
4831                          * Search for the same key again in the root. Since it's
4832                          * an extent item and we are holding the inode lock, the
4833                          * key must still exist. If it doesn't just emit warning
4834                          * and return an error to fall back to a transaction
4835                          * commit.
4836                          */
4837                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4838                         if (ret < 0)
4839                                 return ret;
4840                         if (WARN_ON(ret > 0))
4841                                 return -ENOENT;
4842                         leaf = path->nodes[0];
4843                 }
4844
4845                 prev_extent_end = btrfs_file_extent_end(path);
4846                 path->slots[0]++;
4847                 cond_resched();
4848         }
4849
4850         if (prev_extent_end < i_size) {
4851                 u64 hole_len;
4852
4853                 btrfs_release_path(path);
4854                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4855                 ret = btrfs_insert_file_extent(trans, root->log_root,
4856                                                ino, prev_extent_end, 0, 0,
4857                                                hole_len, 0, hole_len,
4858                                                0, 0, 0);
4859                 if (ret < 0)
4860                         return ret;
4861         }
4862
4863         return 0;
4864 }
4865
4866 /*
4867  * When we are logging a new inode X, check if it doesn't have a reference that
4868  * matches the reference from some other inode Y created in a past transaction
4869  * and that was renamed in the current transaction. If we don't do this, then at
4870  * log replay time we can lose inode Y (and all its files if it's a directory):
4871  *
4872  * mkdir /mnt/x
4873  * echo "hello world" > /mnt/x/foobar
4874  * sync
4875  * mv /mnt/x /mnt/y
4876  * mkdir /mnt/x                 # or touch /mnt/x
4877  * xfs_io -c fsync /mnt/x
4878  * <power fail>
4879  * mount fs, trigger log replay
4880  *
4881  * After the log replay procedure, we would lose the first directory and all its
4882  * files (file foobar).
4883  * For the case where inode Y is not a directory we simply end up losing it:
4884  *
4885  * echo "123" > /mnt/foo
4886  * sync
4887  * mv /mnt/foo /mnt/bar
4888  * echo "abc" > /mnt/foo
4889  * xfs_io -c fsync /mnt/foo
4890  * <power fail>
4891  *
4892  * We also need this for cases where a snapshot entry is replaced by some other
4893  * entry (file or directory) otherwise we end up with an unreplayable log due to
4894  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4895  * if it were a regular entry:
4896  *
4897  * mkdir /mnt/x
4898  * btrfs subvolume snapshot /mnt /mnt/x/snap
4899  * btrfs subvolume delete /mnt/x/snap
4900  * rmdir /mnt/x
4901  * mkdir /mnt/x
4902  * fsync /mnt/x or fsync some new file inside it
4903  * <power fail>
4904  *
4905  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4906  * the same transaction.
4907  */
4908 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4909                                          const int slot,
4910                                          const struct btrfs_key *key,
4911                                          struct btrfs_inode *inode,
4912                                          u64 *other_ino, u64 *other_parent)
4913 {
4914         int ret;
4915         struct btrfs_path *search_path;
4916         char *name = NULL;
4917         u32 name_len = 0;
4918         u32 item_size = btrfs_item_size_nr(eb, slot);
4919         u32 cur_offset = 0;
4920         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4921
4922         search_path = btrfs_alloc_path();
4923         if (!search_path)
4924                 return -ENOMEM;
4925         search_path->search_commit_root = 1;
4926         search_path->skip_locking = 1;
4927
4928         while (cur_offset < item_size) {
4929                 u64 parent;
4930                 u32 this_name_len;
4931                 u32 this_len;
4932                 unsigned long name_ptr;
4933                 struct btrfs_dir_item *di;
4934
4935                 if (key->type == BTRFS_INODE_REF_KEY) {
4936                         struct btrfs_inode_ref *iref;
4937
4938                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4939                         parent = key->offset;
4940                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
4941                         name_ptr = (unsigned long)(iref + 1);
4942                         this_len = sizeof(*iref) + this_name_len;
4943                 } else {
4944                         struct btrfs_inode_extref *extref;
4945
4946                         extref = (struct btrfs_inode_extref *)(ptr +
4947                                                                cur_offset);
4948                         parent = btrfs_inode_extref_parent(eb, extref);
4949                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
4950                         name_ptr = (unsigned long)&extref->name;
4951                         this_len = sizeof(*extref) + this_name_len;
4952                 }
4953
4954                 if (this_name_len > name_len) {
4955                         char *new_name;
4956
4957                         new_name = krealloc(name, this_name_len, GFP_NOFS);
4958                         if (!new_name) {
4959                                 ret = -ENOMEM;
4960                                 goto out;
4961                         }
4962                         name_len = this_name_len;
4963                         name = new_name;
4964                 }
4965
4966                 read_extent_buffer(eb, name, name_ptr, this_name_len);
4967                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4968                                 parent, name, this_name_len, 0);
4969                 if (di && !IS_ERR(di)) {
4970                         struct btrfs_key di_key;
4971
4972                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4973                                                   di, &di_key);
4974                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4975                                 if (di_key.objectid != key->objectid) {
4976                                         ret = 1;
4977                                         *other_ino = di_key.objectid;
4978                                         *other_parent = parent;
4979                                 } else {
4980                                         ret = 0;
4981                                 }
4982                         } else {
4983                                 ret = -EAGAIN;
4984                         }
4985                         goto out;
4986                 } else if (IS_ERR(di)) {
4987                         ret = PTR_ERR(di);
4988                         goto out;
4989                 }
4990                 btrfs_release_path(search_path);
4991
4992                 cur_offset += this_len;
4993         }
4994         ret = 0;
4995 out:
4996         btrfs_free_path(search_path);
4997         kfree(name);
4998         return ret;
4999 }
5000
5001 struct btrfs_ino_list {            /* work-list entry built by log_conflicting_inodes() */
5002         u64 ino;                   /* number of the conflicting inode to look up and log */
5003         u64 parent;                /* directory looked up and logged when @ino yields -ENOENT */
5004         struct list_head list;     /* link in the caller's local list of pending entries */
5005 };
5006
5007 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5008                                   struct btrfs_root *root,
5009                                   struct btrfs_path *path,
5010                                   struct btrfs_log_ctx *ctx,
5011                                   u64 ino, u64 parent)
5012 {
5013         struct btrfs_ino_list *ino_elem;
5014         LIST_HEAD(inode_list);
5015         int ret = 0;
5016
5017         ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5018         if (!ino_elem)
5019                 return -ENOMEM;
5020         ino_elem->ino = ino;
5021         ino_elem->parent = parent;
5022         list_add_tail(&ino_elem->list, &inode_list);
5023
5024         while (!list_empty(&inode_list)) {
5025                 struct btrfs_fs_info *fs_info = root->fs_info;
5026                 struct btrfs_key key;
5027                 struct inode *inode;
5028
5029                 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
5030                                             list);
5031                 ino = ino_elem->ino;
5032                 parent = ino_elem->parent;
5033                 list_del(&ino_elem->list);
5034                 kfree(ino_elem);
5035                 if (ret)
5036                         continue;
5037
5038                 btrfs_release_path(path);
5039
5040                 inode = btrfs_iget(fs_info->sb, ino, root);
5041                 /*
5042                  * If the other inode that had a conflicting dir entry was
5043                  * deleted in the current transaction, we need to log its parent
5044                  * directory.
5045                  */
5046                 if (IS_ERR(inode)) {
5047                         ret = PTR_ERR(inode);
5048                         if (ret == -ENOENT) {
5049                                 inode = btrfs_iget(fs_info->sb, parent, root);
5050                                 if (IS_ERR(inode)) {
5051                                         ret = PTR_ERR(inode);
5052                                 } else {
5053                                         ret = btrfs_log_inode(trans, root,
5054                                                       BTRFS_I(inode),
5055                                                       LOG_OTHER_INODE_ALL,
5056                                                       ctx);
5057                                         btrfs_add_delayed_iput(inode);
5058                                 }
5059                         }
5060                         continue;
5061                 }
5062                 /*
5063                  * If the inode was already logged skip it - otherwise we can
5064                  * hit an infinite loop. Example:
5065                  *
5066                  * From the commit root (previous transaction) we have the
5067                  * following inodes:
5068                  *
5069                  * inode 257 a directory
5070                  * inode 258 with references "zz" and "zz_link" on inode 257
5071                  * inode 259 with reference "a" on inode 257
5072                  *
5073                  * And in the current (uncommitted) transaction we have:
5074                  *
5075                  * inode 257 a directory, unchanged
5076                  * inode 258 with references "a" and "a2" on inode 257
5077                  * inode 259 with reference "zz_link" on inode 257
5078                  * inode 261 with reference "zz" on inode 257
5079                  *
5080                  * When logging inode 261 the following infinite loop could
5081                  * happen if we don't skip already logged inodes:
5082                  *
5083                  * - we detect inode 258 as a conflicting inode, with inode 261
5084                  *   on reference "zz", and log it;
5085                  *
5086                  * - we detect inode 259 as a conflicting inode, with inode 258
5087                  *   on reference "a", and log it;
5088                  *
5089                  * - we detect inode 258 as a conflicting inode, with inode 259
5090                  *   on reference "zz_link", and log it - again! After this we
5091                  *   repeat the above steps forever.
5092                  */
5093                 spin_lock(&BTRFS_I(inode)->lock);
5094                 /*
5095                  * Check the inode's logged_trans only instead of
5096                  * btrfs_inode_in_log(). This is because the last_log_commit of
5097                  * the inode is not updated when we only log that it exists (see
5098                  * btrfs_log_inode()).
5099                  */
5100                 if (BTRFS_I(inode)->logged_trans == trans->transid) {
5101                         spin_unlock(&BTRFS_I(inode)->lock);
5102                         btrfs_add_delayed_iput(inode);
5103                         continue;
5104                 }
5105                 spin_unlock(&BTRFS_I(inode)->lock);
5106                 /*
5107                  * We are safe logging the other inode without acquiring its
5108                  * lock as long as we log with the LOG_INODE_EXISTS mode. We
5109                  * are safe against concurrent renames of the other inode as
5110                  * well because during a rename we pin the log and update the
5111                  * log with the new name before we unpin it.
5112                  */
5113                 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5114                                       LOG_OTHER_INODE, ctx);
5115                 if (ret) {
5116                         btrfs_add_delayed_iput(inode);
5117                         continue;
5118                 }
5119
5120                 key.objectid = ino;
5121                 key.type = BTRFS_INODE_REF_KEY;
5122                 key.offset = 0;
5123                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5124                 if (ret < 0) {
5125                         btrfs_add_delayed_iput(inode);
5126                         continue;
5127                 }
5128
5129                 while (true) {
5130                         struct extent_buffer *leaf = path->nodes[0];
5131                         int slot = path->slots[0];
5132                         u64 other_ino = 0;
5133                         u64 other_parent = 0;
5134
5135                         if (slot >= btrfs_header_nritems(leaf)) {
5136                                 ret = btrfs_next_leaf(root, path);
5137                                 if (ret < 0) {
5138                                         break;
5139                                 } else if (ret > 0) {
5140                                         ret = 0;
5141                                         break;
5142                                 }
5143                                 continue;
5144                         }
5145
5146                         btrfs_item_key_to_cpu(leaf, &key, slot);
5147                         if (key.objectid != ino ||
5148                             (key.type != BTRFS_INODE_REF_KEY &&
5149                              key.type != BTRFS_INODE_EXTREF_KEY)) {
5150                                 ret = 0;
5151                                 break;
5152                         }
5153
5154                         ret = btrfs_check_ref_name_override(leaf, slot, &key,
5155                                         BTRFS_I(inode), &other_ino,
5156                                         &other_parent);
5157                         if (ret < 0)
5158                                 break;
5159                         if (ret > 0) {
5160                                 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5161                                 if (!ino_elem) {
5162                                         ret = -ENOMEM;
5163                                         break;
5164                                 }
5165                                 ino_elem->ino = other_ino;
5166                                 ino_elem->parent = other_parent;
5167                                 list_add_tail(&ino_elem->list, &inode_list);
5168                                 ret = 0;
5169                         }
5170                         path->slots[0]++;
5171                 }
5172                 btrfs_add_delayed_iput(inode);
5173         }
5174
5175         return ret;
5176 }
5177
/*
 * Copy the items of an inode from its root into the log tree.
 *
 * Starting at @min_key, items are looked up with btrfs_search_forward() (using
 * trans->transid as the minimum transaction id) and walked leaf by leaf until
 * the key's objectid differs from @max_key's or its type goes beyond
 * @max_key->type. Runs of adjacent leaf slots are accumulated in
 * (ins_start_slot, ins_nr) and handed to copy_items() in one call.
 *
 * Special cases handled while walking:
 *  - xattr items are not copied here (they are logged by the caller through
 *    btrfs_log_all_xattrs());
 *  - for inode ref/extref items, when the inode's generation equals the
 *    current transaction id and this is not a recursive logging call,
 *    btrfs_check_ref_name_override() is used to detect a conflicting inode
 *    which, if it is not the inode of @ctx, is logged through
 *    log_conflicting_inodes().
 *
 * @min_key is updated in place as the walk progresses.
 * @need_log_inode_item is set to false if the inode item key is encountered.
 *
 * Returns 0 on success, otherwise the non-zero value returned by the helper
 * that failed.
 */
static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
                                   struct btrfs_inode *inode,
                                   struct btrfs_key *min_key,
                                   const struct btrfs_key *max_key,
                                   struct btrfs_path *path,
                                   struct btrfs_path *dst_path,
                                   const u64 logged_isize,
                                   const bool recursive_logging,
                                   const int inode_only,
                                   struct btrfs_log_ctx *ctx,
                                   bool *need_log_inode_item)
{
        struct btrfs_root *root = inode->root;
        /* Pending batch of adjacent slots not yet passed to copy_items(). */
        int ins_start_slot = 0;
        int ins_nr = 0;
        int ret;

        while (1) {
                ret = btrfs_search_forward(root, min_key, path, trans->transid);
                if (ret < 0)
                        return ret;
                /* A positive return ends the walk without being an error. */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
again:
                /* Note, ins_nr might be > 0 here, cleanup outside the loop */
                if (min_key->objectid != max_key->objectid)
                        break;
                if (min_key->type > max_key->type)
                        break;

                /* The inode item is being copied, the caller need not log it. */
                if (min_key->type == BTRFS_INODE_ITEM_KEY)
                        *need_log_inode_item = false;

                if ((min_key->type == BTRFS_INODE_REF_KEY ||
                     min_key->type == BTRFS_INODE_EXTREF_KEY) &&
                    inode->generation == trans->transid &&
                    !recursive_logging) {
                        u64 other_ino = 0;
                        u64 other_parent = 0;

                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                        path->slots[0], min_key, inode,
                                        &other_ino, &other_parent);
                        if (ret < 0) {
                                return ret;
                        } else if (ret > 0 && ctx &&
                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
                                /*
                                 * Add the current ref item to the pending
                                 * batch and flush the batch before logging
                                 * the conflicting inode.
                                 */
                                if (ins_nr > 0) {
                                        ins_nr++;
                                } else {
                                        ins_nr = 1;
                                        ins_start_slot = path->slots[0];
                                }
                                ret = copy_items(trans, inode, dst_path, path,
                                                 ins_start_slot, ins_nr,
                                                 inode_only, logged_isize);
                                if (ret < 0)
                                        return ret;
                                ins_nr = 0;

                                ret = log_conflicting_inodes(trans, root, path,
                                                ctx, other_ino, other_parent);
                                if (ret)
                                        return ret;
                                /*
                                 * The path was handed to
                                 * log_conflicting_inodes(), so drop it and
                                 * restart the search from the next key.
                                 */
                                btrfs_release_path(path);
                                goto next_key;
                        }
                }

                /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
                if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
                        if (ins_nr == 0)
                                goto next_slot;
                        /* Flush the batch so the xattr slot is left out. */
                        ret = copy_items(trans, inode, dst_path, path,
                                         ins_start_slot,
                                         ins_nr, inode_only, logged_isize);
                        if (ret < 0)
                                return ret;
                        ins_nr = 0;
                        goto next_slot;
                }

                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
                        /* Current slot directly follows the batch, extend it. */
                        ins_nr++;
                        goto next_slot;
                } else if (!ins_nr) {
                        /* No batch pending, start one at the current slot. */
                        ins_start_slot = path->slots[0];
                        ins_nr = 1;
                        goto next_slot;
                }

                /*
                 * The current slot is not adjacent to the pending batch: flush
                 * the batch and start a new one at the current slot.
                 */
                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
                                 ins_nr, inode_only, logged_isize);
                if (ret < 0)
                        return ret;
                ins_nr = 1;
                ins_start_slot = path->slots[0];
next_slot:
                path->slots[0]++;
                if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
                        btrfs_item_key_to_cpu(path->nodes[0], min_key,
                                              path->slots[0]);
                        goto again;
                }
                /*
                 * End of the leaf: flush what is pending before the path, which
                 * the batch's slots refer to, is released.
                 */
                if (ins_nr) {
                        ret = copy_items(trans, inode, dst_path, path,
                                         ins_start_slot, ins_nr, inode_only,
                                         logged_isize);
                        if (ret < 0)
                                return ret;
                        ins_nr = 0;
                }
                btrfs_release_path(path);
next_key:
                /*
                 * Move min_key past the last processed key: bump the offset,
                 * or move on to the next key type while still within
                 * max_key->type, otherwise we are done.
                 */
                if (min_key->offset < (u64)-1) {
                        min_key->offset++;
                } else if (min_key->type < max_key->type) {
                        min_key->type++;
                        min_key->offset = 0;
                } else {
                        break;
                }
        }
        /* Flush a batch left pending by one of the breaks inside the loop. */
        if (ins_nr)
                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
                                 ins_nr, inode_only, logged_isize);

        return ret;
}
5309
/*
 * Log a single inode in the tree log.
 *
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.  An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related values of the enum at the top of this file for
 * a description of what inode_only does. LOG_OTHER_INODE and
 * LOG_OTHER_INODE_ALL are converted here to LOG_INODE_EXISTS and LOG_INODE_ALL
 * respectively; they additionally mark the call as recursive logging and make
 * the inode's log_mutex be taken with mutex_lock_nested().
 *
 * This handles both files and directories.
 *
 * Returns 0 on success and the error of the failing helper otherwise. Returns
 * 1, after calling btrfs_set_log_full_commit(), when a directory logged in
 * LOG_INODE_ALL mode has a last_unlink_trans >= the current transaction id.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx)
{
        struct btrfs_path *path;
        struct btrfs_path *dst_path;
        struct btrfs_key min_key;
        struct btrfs_key max_key;
        struct btrfs_root *log = root->log_root;
        /* err is the value returned; ret holds intermediate helper results. */
        int err = 0;
        int ret = 0;
        bool fast_search = false;
        u64 ino = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 logged_isize = 0;
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
        bool recursive_logging = false;
        bool inode_item_dropped = true;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        dst_path = btrfs_alloc_path();
        if (!dst_path) {
                btrfs_free_path(path);
                return -ENOMEM;
        }

        min_key.objectid = ino;
        min_key.type = BTRFS_INODE_ITEM_KEY;
        min_key.offset = 0;

        max_key.objectid = ino;


        /* today the code can only do partial logging of directories */
        if (S_ISDIR(inode->vfs_inode.i_mode) ||
            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                       &inode->runtime_flags) &&
             inode_only >= LOG_INODE_EXISTS))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;

        /*
         * Only run delayed items if we are a directory. We want to make sure
         * all directory indexes hit the fs/subvolume tree so we can find them
         * and figure out which index ranges have to be logged.
         *
         * Otherwise commit the delayed inode only if the full sync flag is set,
         * as we want to make sure an up to date version is in the subvolume
         * tree so copy_inode_items_to_log() / copy_items() can find it and copy
         * it to the log tree. For a non full sync, we always log the inode item
         * based on the in-memory struct btrfs_inode which is always up to date.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode))
                ret = btrfs_commit_inode_delayed_items(trans, inode);
        else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
                ret = btrfs_commit_inode_delayed_inode(inode);

        /* The log mutex is not held yet, so only the paths need cleanup. */
        if (ret) {
                btrfs_free_path(path);
                btrfs_free_path(dst_path);
                return ret;
        }

        /*
         * Map the "other inode" modes to the regular ones, remembering that
         * this is a recursive logging call, and take the log mutex with the
         * nested lock annotation in that case.
         */
        if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
                recursive_logging = true;
                if (inode_only == LOG_OTHER_INODE)
                        inode_only = LOG_INODE_EXISTS;
                else
                        inode_only = LOG_INODE_ALL;
                mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
        } else {
                mutex_lock(&inode->log_mutex);
        }

        /*
         * This is for cases where logging a directory could result in losing
         * a file after replaying the log. For example, if we move a file from a
         * directory A to a directory B, then fsync directory A, we have no way
         * to know the file was moved from A to B, so logging just A would
         * result in losing the file after a log replay.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode) &&
            inode_only == LOG_INODE_ALL &&
            inode->last_unlink_trans >= trans->transid) {
                btrfs_set_log_full_commit(trans);
                err = 1;
                goto out_unlock;
        }

        /*
         * a brute force approach to making sure we get the most uptodate
         * copies of everything.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode)) {
                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

                clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
                if (inode_only == LOG_INODE_EXISTS)
                        max_key_type = BTRFS_XATTR_ITEM_KEY;
                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
        } else {
                if (inode_only == LOG_INODE_EXISTS) {
                        /*
                         * Make sure the new inode item we write to the log has
                         * the same isize as the current one (if it exists).
                         * This is necessary to prevent data loss after log
                         * replay, and also to prevent doing a wrong expanding
                         * truncate - for e.g. create file, write 4K into offset
                         * 0, fsync, write 4K into offset 4096, add hard link,
                         * fsync some other file (to sync log), power fail - if
                         * we use the inode's current i_size, after log replay
                         * we get a 8Kb file, with the last 4Kb extent as a hole
                         * (zeroes), as if an expanding truncate happened,
                         * instead of getting a file of 4Kb only.
                         */
                        err = logged_inode_size(log, inode, path, &logged_isize);
                        if (err)
                                goto out_unlock;
                }
                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &inode->runtime_flags)) {
                        if (inode_only == LOG_INODE_EXISTS) {
                                /*
                                 * Note that the full sync flag is left set in
                                 * this mode, it is only cleared when logging
                                 * with LOG_INODE_ALL below.
                                 */
                                max_key.type = BTRFS_XATTR_ITEM_KEY;
                                ret = drop_objectid_items(trans, log, path, ino,
                                                          max_key.type);
                        } else {
                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                                          &inode->runtime_flags);
                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                          &inode->runtime_flags);
                                /* Retry for as long as -EAGAIN is returned. */
                                while(1) {
                                        ret = btrfs_truncate_inode_items(trans,
                                                log, inode, 0, 0, NULL);
                                        if (ret != -EAGAIN)
                                                break;
                                }
                        }
                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                              &inode->runtime_flags) ||
                           inode_only == LOG_INODE_EXISTS) {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        max_key.type = BTRFS_XATTR_ITEM_KEY;
                        ret = drop_objectid_items(trans, log, path, ino,
                                                  max_key.type);
                } else {
                        /*
                         * Nothing was dropped from the log and there is nothing
                         * to copy from the subvolume tree, so skip straight to
                         * logging the inode item and the extents.
                         */
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        inode_item_dropped = false;
                        goto log_extents;
                }

        }
        if (ret) {
                err = ret;
                goto out_unlock;
        }

        err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
                                      path, dst_path, logged_isize,
                                      recursive_logging, inode_only, ctx,
                                      &need_log_inode_item);
        if (err)
                goto out_unlock;

        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
        if (err)
                goto out_unlock;
        xattrs_logged = true;
        /* Holes are only logged here when extent items were in the key range. */
        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
                btrfs_release_path(path);
                btrfs_release_path(dst_path);
                err = btrfs_log_holes(trans, root, inode, path);
                if (err)
                        goto out_unlock;
        }
log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        /*
         * Still true if copy_inode_items_to_log() did not run or did not come
         * across the inode item.
         */
        if (need_log_inode_item) {
                err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
                if (err)
                        goto out_unlock;
                /*
                 * If we are doing a fast fsync and the inode was logged before
                 * in this transaction, we don't need to log the xattrs because
                 * they were logged before. If xattrs were added, changed or
                 * deleted since the last time we logged the inode, then we have
                 * already logged them because the inode had the runtime flag
                 * BTRFS_INODE_COPY_EVERYTHING set.
                 */
                if (!xattrs_logged && inode->logged_trans < trans->transid) {
                        err = btrfs_log_all_xattrs(trans, root, inode, path,
                                                   dst_path);
                        if (err)
                                goto out_unlock;
                        btrfs_release_path(path);
                }
        }
        if (fast_search) {
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                ctx);
                if (ret) {
                        err = ret;
                        goto out_unlock;
                }
        } else if (inode_only == LOG_INODE_ALL) {
                struct extent_map *em, *n;

                /*
                 * Not using the list of modified extents in this case, so just
                 * empty it.
                 */
                write_lock(&em_tree->lock);
                list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
                        list_del_init(&em->list);
                write_unlock(&em_tree->lock);
        }

        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
                ret = log_directory_changes(trans, root, inode, path, dst_path,
                                        ctx);
                if (ret) {
                        err = ret;
                        goto out_unlock;
                }
        }

        /*
         * If we are logging that an ancestor inode exists as part of logging a
         * new name from a link or rename operation, don't mark the inode as
         * logged - otherwise if an explicit fsync is made against an ancestor,
         * the fsync considers the inode in the log and doesn't sync the log,
         * resulting in the ancestor missing after a power failure unless the
         * log was synced as part of an fsync against any other unrelated inode.
         * So keep it simple for this case and just don't flag the ancestors as
         * logged.
         */
        if (!ctx ||
            !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
              &inode->vfs_inode != ctx->inode)) {
                spin_lock(&inode->lock);
                inode->logged_trans = trans->transid;
                /*
                 * Don't update last_log_commit if we logged that an inode exists.
                 * We do this for two reasons:
                 *
                 * 1) We might have had buffered writes to this inode that were
                 *    flushed and had their ordered extents completed in this
                 *    transaction, but we did not previously log the inode with
                 *    LOG_INODE_ALL. Later the inode was evicted and after that
                 *    it was loaded again and this LOG_INODE_EXISTS log operation
                 *    happened. We must make sure that if an explicit fsync against
                 *    the inode is performed later, it logs the new extents, an
                 *    updated inode item, etc, and syncs the log. The same logic
                 *    applies to direct IO writes instead of buffered writes.
                 *
                 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
                 *    is logged with an i_size of 0 or whatever value was logged
                 *    before. If later the i_size of the inode is increased by a
                 *    truncate operation, the log is synced through an fsync of
                 *    some other inode and then finally an explicit fsync against
                 *    this inode is made, we must make sure this fsync logs the
                 *    inode with the new i_size, the hole between old i_size and
                 *    the new i_size, and syncs the log.
                 */
                if (inode_only != LOG_INODE_EXISTS)
                        inode->last_log_commit = inode->last_sub_trans;
                spin_unlock(&inode->lock);
        }
out_unlock:
        mutex_unlock(&inode->log_mutex);

        btrfs_free_path(path);
        btrfs_free_path(dst_path);
        return err;
}
5605
5606 /*
5607  * Check if we need to log an inode. This is used in contexts where while
5608  * logging an inode we need to log another inode (either that it exists or in
5609  * full mode). This is used instead of btrfs_inode_in_log() because the later
5610  * requires the inode to be in the log and have the log transaction committed,
5611  * while here we do not care if the log transaction was already committed - our
5612  * caller will commit the log later - and we want to avoid logging an inode
5613  * multiple times when multiple tasks have joined the same log transaction.
5614  */
5615 static bool need_log_inode(struct btrfs_trans_handle *trans,
5616                            struct btrfs_inode *inode)
5617 {
5618         /*
5619          * If a directory was not modified, no dentries added or removed, we can
5620          * and should avoid logging it.
5621          */
5622         if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5623                 return false;
5624
5625         /*
5626          * If this inode does not have new/updated/deleted xattrs since the last
5627          * time it was logged and is flagged as logged in the current transaction,
5628          * we can skip logging it. As for new/deleted names, those are updated in
5629          * the log by link/unlink/rename operations.
5630          * In case the inode was logged and then evicted and reloaded, its
5631          * logged_trans will be 0, in which case we have to fully log it since
5632          * logged_trans is a transient field, not persisted.
5633          */
5634         if (inode->logged_trans == trans->transid &&
5635             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5636                 return false;
5637
5638         return true;
5639 }
5640
/*
 * Element of the list of directories still to be processed by
 * log_new_dir_dentries().
 */
struct btrfs_dir_list {
        /* Inode number of the directory, used as the objectid of search keys. */
        u64 ino;
        /* Links the element into the local list of pending directories. */
        struct list_head list;
};
5645
5646 /*
5647  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5648  * details about why it is needed.
5649  * This is a recursive operation - if an existing dentry corresponds to a
5650  * directory, that directory's new entries are logged too (same behaviour as
5651  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5652  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5653  * complains about the following circular lock dependency / possible deadlock:
5654  *
5655  *        CPU0                                        CPU1
5656  *        ----                                        ----
5657  * lock(&type->i_mutex_dir_key#3/2);
5658  *                                            lock(sb_internal#2);
5659  *                                            lock(&type->i_mutex_dir_key#3/2);
5660  * lock(&sb->s_type->i_mutex_key#14);
5661  *
5662  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5663  * sb_start_intwrite() in btrfs_start_transaction().
5664  * Not locking i_mutex of the inodes is still safe because:
5665  *
5666  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5667  *    that while logging the inode new references (names) are added or removed
5668  *    from the inode, leaving the logged inode item with a link count that does
5669  *    not match the number of logged inode reference items. This is fine because
5670  *    at log replay time we compute the real number of links and correct the
5671  *    link count in the inode item (see replay_one_buffer() and
5672  *    link_to_fixup_dir());
5673  *
5674  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5675  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5676  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5677  *    has a size that doesn't match the sum of the lengths of all the logged
5678  *    names. This does not result in a problem because if a dir_item key is
5679  *    logged but its matching dir_index key is not logged, at log replay time we
5680  *    don't use it to replay the respective name (see replay_one_name()). On the
5681  *    other hand if only the dir_index key ends up being logged, the respective
5682  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5683  *    keys created (see replay_one_name()).
5684  *    The directory's inode item with a wrong i_size is not a problem as well,
5685  *    since we don't use it at log replay time to set the i_size in the inode
5686  *    item of the fs/subvol tree (see overwrite_item()).
5687  */
5688 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5689                                 struct btrfs_root *root,
5690                                 struct btrfs_inode *start_inode,
5691                                 struct btrfs_log_ctx *ctx)
5692 {
5693         struct btrfs_fs_info *fs_info = root->fs_info;
5694         struct btrfs_root *log = root->log_root;
5695         struct btrfs_path *path;
5696         LIST_HEAD(dir_list);
5697         struct btrfs_dir_list *dir_elem;
5698         int ret = 0;
5699
5700         path = btrfs_alloc_path();
5701         if (!path)
5702                 return -ENOMEM;
5703
5704         dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5705         if (!dir_elem) {
5706                 btrfs_free_path(path);
5707                 return -ENOMEM;
5708         }
5709         dir_elem->ino = btrfs_ino(start_inode);
5710         list_add_tail(&dir_elem->list, &dir_list);
5711
5712         while (!list_empty(&dir_list)) {
5713                 struct extent_buffer *leaf;
5714                 struct btrfs_key min_key;
5715                 int nritems;
5716                 int i;
5717
5718                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5719                                             list);
5720                 if (ret)
5721                         goto next_dir_inode;
5722
5723                 min_key.objectid = dir_elem->ino;
5724                 min_key.type = BTRFS_DIR_ITEM_KEY;
5725                 min_key.offset = 0;
5726 again:
5727                 btrfs_release_path(path);
5728                 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5729                 if (ret < 0) {
5730                         goto next_dir_inode;
5731                 } else if (ret > 0) {
5732                         ret = 0;
5733                         goto next_dir_inode;
5734                 }
5735
5736 process_leaf:
5737                 leaf = path->nodes[0];
5738                 nritems = btrfs_header_nritems(leaf);
5739                 for (i = path->slots[0]; i < nritems; i++) {
5740                         struct btrfs_dir_item *di;
5741                         struct btrfs_key di_key;
5742                         struct inode *di_inode;
5743                         struct btrfs_dir_list *new_dir_elem;
5744                         int log_mode = LOG_INODE_EXISTS;
5745                         int type;
5746
5747                         btrfs_item_key_to_cpu(leaf, &min_key, i);
5748                         if (min_key.objectid != dir_elem->ino ||
5749                             min_key.type != BTRFS_DIR_ITEM_KEY)
5750                                 goto next_dir_inode;
5751
5752                         di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5753                         type = btrfs_dir_type(leaf, di);
5754                         if (btrfs_dir_transid(leaf, di) < trans->transid &&
5755                             type != BTRFS_FT_DIR)
5756                                 continue;
5757                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5758                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5759                                 continue;
5760
5761                         btrfs_release_path(path);
5762                         di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5763                         if (IS_ERR(di_inode)) {
5764                                 ret = PTR_ERR(di_inode);
5765                                 goto next_dir_inode;
5766                         }
5767
5768                         if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5769                                 btrfs_add_delayed_iput(di_inode);
5770                                 break;
5771                         }
5772
5773                         ctx->log_new_dentries = false;
5774                         if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5775                                 log_mode = LOG_INODE_ALL;
5776                         ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5777                                               log_mode, ctx);
5778                         btrfs_add_delayed_iput(di_inode);
5779                         if (ret)
5780                                 goto next_dir_inode;
5781                         if (ctx->log_new_dentries) {
5782                                 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5783                                                        GFP_NOFS);
5784                                 if (!new_dir_elem) {
5785                                         ret = -ENOMEM;
5786                                         goto next_dir_inode;
5787                                 }
5788                                 new_dir_elem->ino = di_key.objectid;
5789                                 list_add_tail(&new_dir_elem->list, &dir_list);
5790                         }
5791                         break;
5792                 }
5793                 if (i == nritems) {
5794                         ret = btrfs_next_leaf(log, path);
5795                         if (ret < 0) {
5796                                 goto next_dir_inode;
5797                         } else if (ret > 0) {
5798                                 ret = 0;
5799                                 goto next_dir_inode;
5800                         }
5801                         goto process_leaf;
5802                 }
5803                 if (min_key.offset < (u64)-1) {
5804                         min_key.offset++;
5805                         goto again;
5806                 }
5807 next_dir_inode:
5808                 list_del(&dir_elem->list);
5809                 kfree(dir_elem);
5810         }
5811
5812         btrfs_free_path(path);
5813         return ret;
5814 }
5815
5816 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5817                                  struct btrfs_inode *inode,
5818                                  struct btrfs_log_ctx *ctx)
5819 {
5820         struct btrfs_fs_info *fs_info = trans->fs_info;
5821         int ret;
5822         struct btrfs_path *path;
5823         struct btrfs_key key;
5824         struct btrfs_root *root = inode->root;
5825         const u64 ino = btrfs_ino(inode);
5826
5827         path = btrfs_alloc_path();
5828         if (!path)
5829                 return -ENOMEM;
5830         path->skip_locking = 1;
5831         path->search_commit_root = 1;
5832
5833         key.objectid = ino;
5834         key.type = BTRFS_INODE_REF_KEY;
5835         key.offset = 0;
5836         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5837         if (ret < 0)
5838                 goto out;
5839
5840         while (true) {
5841                 struct extent_buffer *leaf = path->nodes[0];
5842                 int slot = path->slots[0];
5843                 u32 cur_offset = 0;
5844                 u32 item_size;
5845                 unsigned long ptr;
5846
5847                 if (slot >= btrfs_header_nritems(leaf)) {
5848                         ret = btrfs_next_leaf(root, path);
5849                         if (ret < 0)
5850                                 goto out;
5851                         else if (ret > 0)
5852                                 break;
5853                         continue;
5854                 }
5855
5856                 btrfs_item_key_to_cpu(leaf, &key, slot);
5857                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5858                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5859                         break;
5860
5861                 item_size = btrfs_item_size_nr(leaf, slot);
5862                 ptr = btrfs_item_ptr_offset(leaf, slot);
5863                 while (cur_offset < item_size) {
5864                         struct btrfs_key inode_key;
5865                         struct inode *dir_inode;
5866
5867                         inode_key.type = BTRFS_INODE_ITEM_KEY;
5868                         inode_key.offset = 0;
5869
5870                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
5871                                 struct btrfs_inode_extref *extref;
5872
5873                                 extref = (struct btrfs_inode_extref *)
5874                                         (ptr + cur_offset);
5875                                 inode_key.objectid = btrfs_inode_extref_parent(
5876                                         leaf, extref);
5877                                 cur_offset += sizeof(*extref);
5878                                 cur_offset += btrfs_inode_extref_name_len(leaf,
5879                                         extref);
5880                         } else {
5881                                 inode_key.objectid = key.offset;
5882                                 cur_offset = item_size;
5883                         }
5884
5885                         dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5886                                                root);
5887                         /*
5888                          * If the parent inode was deleted, return an error to
5889                          * fallback to a transaction commit. This is to prevent
5890                          * getting an inode that was moved from one parent A to
5891                          * a parent B, got its former parent A deleted and then
5892                          * it got fsync'ed, from existing at both parents after
5893                          * a log replay (and the old parent still existing).
5894                          * Example:
5895                          *
5896                          * mkdir /mnt/A
5897                          * mkdir /mnt/B
5898                          * touch /mnt/B/bar
5899                          * sync
5900                          * mv /mnt/B/bar /mnt/A/bar
5901                          * mv -T /mnt/A /mnt/B
5902                          * fsync /mnt/B/bar
5903                          * <power fail>
5904                          *
5905                          * If we ignore the old parent B which got deleted,
5906                          * after a log replay we would have file bar linked
5907                          * at both parents and the old parent B would still
5908                          * exist.
5909                          */
5910                         if (IS_ERR(dir_inode)) {
5911                                 ret = PTR_ERR(dir_inode);
5912                                 goto out;
5913                         }
5914
5915                         if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
5916                                 btrfs_add_delayed_iput(dir_inode);
5917                                 continue;
5918                         }
5919
5920                         if (ctx)
5921                                 ctx->log_new_dentries = false;
5922                         ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5923                                               LOG_INODE_ALL, ctx);
5924                         if (!ret && ctx && ctx->log_new_dentries)
5925                                 ret = log_new_dir_dentries(trans, root,
5926                                                    BTRFS_I(dir_inode), ctx);
5927                         btrfs_add_delayed_iput(dir_inode);
5928                         if (ret)
5929                                 goto out;
5930                 }
5931                 path->slots[0]++;
5932         }
5933         ret = 0;
5934 out:
5935         btrfs_free_path(path);
5936         return ret;
5937 }
5938
5939 static int log_new_ancestors(struct btrfs_trans_handle *trans,
5940                              struct btrfs_root *root,
5941                              struct btrfs_path *path,
5942                              struct btrfs_log_ctx *ctx)
5943 {
5944         struct btrfs_key found_key;
5945
5946         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5947
5948         while (true) {
5949                 struct btrfs_fs_info *fs_info = root->fs_info;
5950                 struct extent_buffer *leaf = path->nodes[0];
5951                 int slot = path->slots[0];
5952                 struct btrfs_key search_key;
5953                 struct inode *inode;
5954                 u64 ino;
5955                 int ret = 0;
5956
5957                 btrfs_release_path(path);
5958
5959                 ino = found_key.offset;
5960
5961                 search_key.objectid = found_key.offset;
5962                 search_key.type = BTRFS_INODE_ITEM_KEY;
5963                 search_key.offset = 0;
5964                 inode = btrfs_iget(fs_info->sb, ino, root);
5965                 if (IS_ERR(inode))
5966                         return PTR_ERR(inode);
5967
5968                 if (BTRFS_I(inode)->generation >= trans->transid &&
5969                     need_log_inode(trans, BTRFS_I(inode)))
5970                         ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5971                                               LOG_INODE_EXISTS, ctx);
5972                 btrfs_add_delayed_iput(inode);
5973                 if (ret)
5974                         return ret;
5975
5976                 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5977                         break;
5978
5979                 search_key.type = BTRFS_INODE_REF_KEY;
5980                 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5981                 if (ret < 0)
5982                         return ret;
5983
5984                 leaf = path->nodes[0];
5985                 slot = path->slots[0];
5986                 if (slot >= btrfs_header_nritems(leaf)) {
5987                         ret = btrfs_next_leaf(root, path);
5988                         if (ret < 0)
5989                                 return ret;
5990                         else if (ret > 0)
5991                                 return -ENOENT;
5992                         leaf = path->nodes[0];
5993                         slot = path->slots[0];
5994                 }
5995
5996                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5997                 if (found_key.objectid != search_key.objectid ||
5998                     found_key.type != BTRFS_INODE_REF_KEY)
5999                         return -ENOENT;
6000         }
6001         return 0;
6002 }
6003
6004 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6005                                   struct btrfs_inode *inode,
6006                                   struct dentry *parent,
6007                                   struct btrfs_log_ctx *ctx)
6008 {
6009         struct btrfs_root *root = inode->root;
6010         struct dentry *old_parent = NULL;
6011         struct super_block *sb = inode->vfs_inode.i_sb;
6012         int ret = 0;
6013
6014         while (true) {
6015                 if (!parent || d_really_is_negative(parent) ||
6016                     sb != parent->d_sb)
6017                         break;
6018
6019                 inode = BTRFS_I(d_inode(parent));
6020                 if (root != inode->root)
6021                         break;
6022
6023                 if (inode->generation >= trans->transid &&
6024                     need_log_inode(trans, inode)) {
6025                         ret = btrfs_log_inode(trans, root, inode,
6026                                               LOG_INODE_EXISTS, ctx);
6027                         if (ret)
6028                                 break;
6029                 }
6030                 if (IS_ROOT(parent))
6031                         break;
6032
6033                 parent = dget_parent(parent);
6034                 dput(old_parent);
6035                 old_parent = parent;
6036         }
6037         dput(old_parent);
6038
6039         return ret;
6040 }
6041
6042 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6043                                  struct btrfs_inode *inode,
6044                                  struct dentry *parent,
6045                                  struct btrfs_log_ctx *ctx)
6046 {
6047         struct btrfs_root *root = inode->root;
6048         const u64 ino = btrfs_ino(inode);
6049         struct btrfs_path *path;
6050         struct btrfs_key search_key;
6051         int ret;
6052
6053         /*
6054          * For a single hard link case, go through a fast path that does not
6055          * need to iterate the fs/subvolume tree.
6056          */
6057         if (inode->vfs_inode.i_nlink < 2)
6058                 return log_new_ancestors_fast(trans, inode, parent, ctx);
6059
6060         path = btrfs_alloc_path();
6061         if (!path)
6062                 return -ENOMEM;
6063
6064         search_key.objectid = ino;
6065         search_key.type = BTRFS_INODE_REF_KEY;
6066         search_key.offset = 0;
6067 again:
6068         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6069         if (ret < 0)
6070                 goto out;
6071         if (ret == 0)
6072                 path->slots[0]++;
6073
6074         while (true) {
6075                 struct extent_buffer *leaf = path->nodes[0];
6076                 int slot = path->slots[0];
6077                 struct btrfs_key found_key;
6078
6079                 if (slot >= btrfs_header_nritems(leaf)) {
6080                         ret = btrfs_next_leaf(root, path);
6081                         if (ret < 0)
6082                                 goto out;
6083                         else if (ret > 0)
6084                                 break;
6085                         continue;
6086                 }
6087
6088                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6089                 if (found_key.objectid != ino ||
6090                     found_key.type > BTRFS_INODE_EXTREF_KEY)
6091                         break;
6092
6093                 /*
6094                  * Don't deal with extended references because they are rare
6095                  * cases and too complex to deal with (we would need to keep
6096                  * track of which subitem we are processing for each item in
6097                  * this loop, etc). So just return some error to fallback to
6098                  * a transaction commit.
6099                  */
6100                 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6101                         ret = -EMLINK;
6102                         goto out;
6103                 }
6104
6105                 /*
6106                  * Logging ancestors needs to do more searches on the fs/subvol
6107                  * tree, so it releases the path as needed to avoid deadlocks.
6108                  * Keep track of the last inode ref key and resume from that key
6109                  * after logging all new ancestors for the current hard link.
6110                  */
6111                 memcpy(&search_key, &found_key, sizeof(search_key));
6112
6113                 ret = log_new_ancestors(trans, root, path, ctx);
6114                 if (ret)
6115                         goto out;
6116                 btrfs_release_path(path);
6117                 goto again;
6118         }
6119         ret = 0;
6120 out:
6121         btrfs_free_path(path);
6122         return ret;
6123 }
6124
6125 /*
6126  * helper function around btrfs_log_inode to make sure newly created
6127  * parent directories also end up in the log.  A minimal inode and backref
6128  * only logging is done of any parent directories that are older than
6129  * the last committed transaction
6130  */
6131 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6132                                   struct btrfs_inode *inode,
6133                                   struct dentry *parent,
6134                                   int inode_only,
6135                                   struct btrfs_log_ctx *ctx)
6136 {
6137         struct btrfs_root *root = inode->root;
6138         struct btrfs_fs_info *fs_info = root->fs_info;
6139         int ret = 0;
6140         bool log_dentries = false;
6141
6142         if (btrfs_test_opt(fs_info, NOTREELOG)) {
6143                 ret = 1;
6144                 goto end_no_trans;
6145         }
6146
6147         if (btrfs_root_refs(&root->root_item) == 0) {
6148                 ret = 1;
6149                 goto end_no_trans;
6150         }
6151
6152         /*
6153          * Skip already logged inodes or inodes corresponding to tmpfiles
6154          * (since logging them is pointless, a link count of 0 means they
6155          * will never be accessible).
6156          */
6157         if ((btrfs_inode_in_log(inode, trans->transid) &&
6158              list_empty(&ctx->ordered_extents)) ||
6159             inode->vfs_inode.i_nlink == 0) {
6160                 ret = BTRFS_NO_LOG_SYNC;
6161                 goto end_no_trans;
6162         }
6163
6164         ret = start_log_trans(trans, root, ctx);
6165         if (ret)
6166                 goto end_no_trans;
6167
6168         ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6169         if (ret)
6170                 goto end_trans;
6171
6172         /*
6173          * for regular files, if its inode is already on disk, we don't
6174          * have to worry about the parents at all.  This is because
6175          * we can use the last_unlink_trans field to record renames
6176          * and other fun in this file.
6177          */
6178         if (S_ISREG(inode->vfs_inode.i_mode) &&
6179             inode->generation < trans->transid &&
6180             inode->last_unlink_trans < trans->transid) {
6181                 ret = 0;
6182                 goto end_trans;
6183         }
6184
6185         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6186                 log_dentries = true;
6187
6188         /*
6189          * On unlink we must make sure all our current and old parent directory
6190          * inodes are fully logged. This is to prevent leaving dangling
6191          * directory index entries in directories that were our parents but are
6192          * not anymore. Not doing this results in old parent directory being
6193          * impossible to delete after log replay (rmdir will always fail with
6194          * error -ENOTEMPTY).
6195          *
6196          * Example 1:
6197          *
6198          * mkdir testdir
6199          * touch testdir/foo
6200          * ln testdir/foo testdir/bar
6201          * sync
6202          * unlink testdir/bar
6203          * xfs_io -c fsync testdir/foo
6204          * <power failure>
6205          * mount fs, triggers log replay
6206          *
6207          * If we don't log the parent directory (testdir), after log replay the
6208          * directory still has an entry pointing to the file inode using the bar
6209          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6210          * the file inode has a link count of 1.
6211          *
6212          * Example 2:
6213          *
6214          * mkdir testdir
6215          * touch foo
6216          * ln foo testdir/foo2
6217          * ln foo testdir/foo3
6218          * sync
6219          * unlink testdir/foo3
6220          * xfs_io -c fsync foo
6221          * <power failure>
6222          * mount fs, triggers log replay
6223          *
6224          * Similar as the first example, after log replay the parent directory
6225          * testdir still has an entry pointing to the inode file with name foo3
6226          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6227          * and has a link count of 2.
6228          */
6229         if (inode->last_unlink_trans >= trans->transid) {
6230                 ret = btrfs_log_all_parents(trans, inode, ctx);
6231                 if (ret)
6232                         goto end_trans;
6233         }
6234
6235         ret = log_all_new_ancestors(trans, inode, parent, ctx);
6236         if (ret)
6237                 goto end_trans;
6238
6239         if (log_dentries)
6240                 ret = log_new_dir_dentries(trans, root, inode, ctx);
6241         else
6242                 ret = 0;
6243 end_trans:
6244         if (ret < 0) {
6245                 btrfs_set_log_full_commit(trans);
6246                 ret = 1;
6247         }
6248
6249         if (ret)
6250                 btrfs_remove_log_ctx(root, ctx);
6251         btrfs_end_log_trans(root);
6252 end_no_trans:
6253         return ret;
6254 }
6255
6256 /*
6257  * it is not safe to log dentry if the chunk root has added new
6258  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6259  * If this returns 1, you must commit the transaction to safely get your
6260  * data on disk.
6261  */
6262 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6263                           struct dentry *dentry,
6264                           struct btrfs_log_ctx *ctx)
6265 {
6266         struct dentry *parent = dget_parent(dentry);
6267         int ret;
6268
6269         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6270                                      LOG_INODE_ALL, ctx);
6271         dput(parent);
6272
6273         return ret;
6274 }
6275
6276 /*
6277  * should be called during mount to recover any replay any log trees
6278  * from the FS
6279  */
6280 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6281 {
6282         int ret;
6283         struct btrfs_path *path;
6284         struct btrfs_trans_handle *trans;
6285         struct btrfs_key key;
6286         struct btrfs_key found_key;
6287         struct btrfs_root *log;
6288         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6289         struct walk_control wc = {
6290                 .process_func = process_one_buffer,
6291                 .stage = LOG_WALK_PIN_ONLY,
6292         };
6293
6294         path = btrfs_alloc_path();
6295         if (!path)
6296                 return -ENOMEM;
6297
6298         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6299
6300         trans = btrfs_start_transaction(fs_info->tree_root, 0);
6301         if (IS_ERR(trans)) {
6302                 ret = PTR_ERR(trans);
6303                 goto error;
6304         }
6305
6306         wc.trans = trans;
6307         wc.pin = 1;
6308
6309         ret = walk_log_tree(trans, log_root_tree, &wc);
6310         if (ret) {
6311                 btrfs_handle_fs_error(fs_info, ret,
6312                         "Failed to pin buffers while recovering log root tree.");
6313                 goto error;
6314         }
6315
6316 again:
6317         key.objectid = BTRFS_TREE_LOG_OBJECTID;
6318         key.offset = (u64)-1;
6319         key.type = BTRFS_ROOT_ITEM_KEY;
6320
6321         while (1) {
6322                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6323
6324                 if (ret < 0) {
6325                         btrfs_handle_fs_error(fs_info, ret,
6326                                     "Couldn't find tree log root.");
6327                         goto error;
6328                 }
6329                 if (ret > 0) {
6330                         if (path->slots[0] == 0)
6331                                 break;
6332                         path->slots[0]--;
6333                 }
6334                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6335                                       path->slots[0]);
6336                 btrfs_release_path(path);
6337                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6338                         break;
6339
6340                 log = btrfs_read_tree_root(log_root_tree, &found_key);
6341                 if (IS_ERR(log)) {
6342                         ret = PTR_ERR(log);
6343                         btrfs_handle_fs_error(fs_info, ret,
6344                                     "Couldn't read tree log root.");
6345                         goto error;
6346                 }
6347
6348                 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6349                                                    true);
6350                 if (IS_ERR(wc.replay_dest)) {
6351                         ret = PTR_ERR(wc.replay_dest);
6352
6353                         /*
6354                          * We didn't find the subvol, likely because it was
6355                          * deleted.  This is ok, simply skip this log and go to
6356                          * the next one.
6357                          *
6358                          * We need to exclude the root because we can't have
6359                          * other log replays overwriting this log as we'll read
6360                          * it back in a few more times.  This will keep our
6361                          * block from being modified, and we'll just bail for
6362                          * each subsequent pass.
6363                          */
6364                         if (ret == -ENOENT)
6365                                 ret = btrfs_pin_extent_for_log_replay(trans,
6366                                                         log->node->start,
6367                                                         log->node->len);
6368                         btrfs_put_root(log);
6369
6370                         if (!ret)
6371                                 goto next;
6372                         btrfs_handle_fs_error(fs_info, ret,
6373                                 "Couldn't read target root for tree log recovery.");
6374                         goto error;
6375                 }
6376
6377                 wc.replay_dest->log_root = log;
6378                 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
6379                 if (ret)
6380                         /* The loop needs to continue due to the root refs */
6381                         btrfs_handle_fs_error(fs_info, ret,
6382                                 "failed to record the log root in transaction");
6383                 else
6384                         ret = walk_log_tree(trans, log, &wc);
6385
6386                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6387                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
6388                                                       path);
6389                 }
6390
6391                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6392                         struct btrfs_root *root = wc.replay_dest;
6393
6394                         btrfs_release_path(path);
6395
6396                         /*
6397                          * We have just replayed everything, and the highest
6398                          * objectid of fs roots probably has changed in case
6399                          * some inode_item's got replayed.
6400                          *
6401                          * root->objectid_mutex is not acquired as log replay
6402                          * could only happen during mount.
6403                          */
6404                         ret = btrfs_init_root_free_objectid(root);
6405                 }
6406
6407                 wc.replay_dest->log_root = NULL;
6408                 btrfs_put_root(wc.replay_dest);
6409                 btrfs_put_root(log);
6410
6411                 if (ret)
6412                         goto error;
6413 next:
6414                 if (found_key.offset == 0)
6415                         break;
6416                 key.offset = found_key.offset - 1;
6417         }
6418         btrfs_release_path(path);
6419
6420         /* step one is to pin it all, step two is to replay just inodes */
6421         if (wc.pin) {
6422                 wc.pin = 0;
6423                 wc.process_func = replay_one_buffer;
6424                 wc.stage = LOG_WALK_REPLAY_INODES;
6425                 goto again;
6426         }
6427         /* step three is to replay everything */
6428         if (wc.stage < LOG_WALK_REPLAY_ALL) {
6429                 wc.stage++;
6430                 goto again;
6431         }
6432
6433         btrfs_free_path(path);
6434
6435         /* step 4: commit the transaction, which also unpins the blocks */
6436         ret = btrfs_commit_transaction(trans);
6437         if (ret)
6438                 return ret;
6439
6440         log_root_tree->log_root = NULL;
6441         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6442         btrfs_put_root(log_root_tree);
6443
6444         return 0;
6445 error:
6446         if (wc.trans)
6447                 btrfs_end_transaction(wc.trans);
6448         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6449         btrfs_free_path(path);
6450         return ret;
6451 }
6452
6453 /*
6454  * there are some corner cases where we want to force a full
6455  * commit instead of allowing a directory to be logged.
6456  *
6457  * They revolve around files there were unlinked from the directory, and
6458  * this function updates the parent directory so that a full commit is
6459  * properly done if it is fsync'd later after the unlinks are done.
6460  *
6461  * Must be called before the unlink operations (updates to the subvolume tree,
6462  * inodes, etc) are done.
6463  */
6464 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6465                              struct btrfs_inode *dir, struct btrfs_inode *inode,
6466                              int for_rename)
6467 {
6468         /*
6469          * when we're logging a file, if it hasn't been renamed
6470          * or unlinked, and its inode is fully committed on disk,
6471          * we don't have to worry about walking up the directory chain
6472          * to log its parents.
6473          *
6474          * So, we use the last_unlink_trans field to put this transid
6475          * into the file.  When the file is logged we check it and
6476          * don't log the parents if the file is fully on disk.
6477          */
6478         mutex_lock(&inode->log_mutex);
6479         inode->last_unlink_trans = trans->transid;
6480         mutex_unlock(&inode->log_mutex);
6481
6482         /*
6483          * if this directory was already logged any new
6484          * names for this file/dir will get recorded
6485          */
6486         if (dir->logged_trans == trans->transid)
6487                 return;
6488
6489         /*
6490          * if the inode we're about to unlink was logged,
6491          * the log will be properly updated for any new names
6492          */
6493         if (inode->logged_trans == trans->transid)
6494                 return;
6495
6496         /*
6497          * when renaming files across directories, if the directory
6498          * there we're unlinking from gets fsync'd later on, there's
6499          * no way to find the destination directory later and fsync it
6500          * properly.  So, we have to be conservative and force commits
6501          * so the new name gets discovered.
6502          */
6503         if (for_rename)
6504                 goto record;
6505
6506         /* we can safely do the unlink without any special recording */
6507         return;
6508
6509 record:
6510         mutex_lock(&dir->log_mutex);
6511         dir->last_unlink_trans = trans->transid;
6512         mutex_unlock(&dir->log_mutex);
6513 }
6514
6515 /*
6516  * Make sure that if someone attempts to fsync the parent directory of a deleted
6517  * snapshot, it ends up triggering a transaction commit. This is to guarantee
6518  * that after replaying the log tree of the parent directory's root we will not
6519  * see the snapshot anymore and at log replay time we will not see any log tree
6520  * corresponding to the deleted snapshot's root, which could lead to replaying
6521  * it after replaying the log tree of the parent directory (which would replay
6522  * the snapshot delete operation).
6523  *
6524  * Must be called before the actual snapshot destroy operation (updates to the
6525  * parent root and tree of tree roots trees, etc) are done.
6526  */
6527 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6528                                    struct btrfs_inode *dir)
6529 {
6530         mutex_lock(&dir->log_mutex);
6531         dir->last_unlink_trans = trans->transid;
6532         mutex_unlock(&dir->log_mutex);
6533 }
6534
6535 /*
6536  * Call this after adding a new name for a file and it will properly
6537  * update the log to reflect the new name.
6538  */
6539 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6540                         struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6541                         struct dentry *parent)
6542 {
6543         struct btrfs_log_ctx ctx;
6544
6545         /*
6546          * this will force the logging code to walk the dentry chain
6547          * up for the file
6548          */
6549         if (!S_ISDIR(inode->vfs_inode.i_mode))
6550                 inode->last_unlink_trans = trans->transid;
6551
6552         /*
6553          * if this inode hasn't been logged and directory we're renaming it
6554          * from hasn't been logged, we don't need to log it
6555          */
6556         if (!inode_logged(trans, inode) &&
6557             (!old_dir || !inode_logged(trans, old_dir)))
6558                 return;
6559
6560         /*
6561          * If we are doing a rename (old_dir is not NULL) from a directory that
6562          * was previously logged, make sure the next log attempt on the directory
6563          * is not skipped and logs the inode again. This is because the log may
6564          * not currently be authoritative for a range including the old
6565          * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
6566          * sure after a log replay we do not end up with both the new and old
6567          * dentries around (in case the inode is a directory we would have a
6568          * directory with two hard links and 2 inode references for different
6569          * parents). The next log attempt of old_dir will happen at
6570          * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
6571          * below, because we have previously set inode->last_unlink_trans to the
6572          * current transaction ID, either here or at btrfs_record_unlink_dir() in
6573          * case inode is a directory.
6574          */
6575         if (old_dir)
6576                 old_dir->logged_trans = 0;
6577
6578         btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6579         ctx.logging_new_name = true;
6580         /*
6581          * We don't care about the return value. If we fail to log the new name
6582          * then we know the next attempt to sync the log will fallback to a full
6583          * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6584          * we don't need to worry about getting a log committed that has an
6585          * inconsistent state after a rename operation.
6586          */
6587         btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6588 }
6589