2 * Ext4 orphan inode handling
5 #include <linux/quotaops.h>
6 #include <linux/buffer_head.h>
/*
 * ext4_orphan_file_add() - record @inode in a free slot of the orphan file.
 * @handle: journal handle used for write access and for dirtying the block
 * @inode:  inode whose number is stored in the orphan file
 *
 * Under of_lock, picks the first orphan block whose ob_free_entries is
 * non-zero and decrements that counter. After obtaining journal write access
 * to the block buffer, it retakes of_lock, stores the inode number in the
 * first zero slot, remembers the slot in i_orphan_idx and sets
 * EXT4_STATE_ORPHAN_FILE. The final result is whatever
 * ext4_handle_dirty_metadata() returns for the block buffer.
 *
 * NOTE(review): the local declarations, the value returned when every block
 * is full, and the check of ret after ext4_journal_get_write_access() are
 * not visible in this excerpt - confirm against the full source.
 */
11 static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
14 struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
17 int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
19 spin_lock(&oi->of_lock);
/* Skip blocks with no free entries; the loop body is intentionally empty. */
20 for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++);
21 if (i == oi->of_blocks) {
22 spin_unlock(&oi->of_lock);
24 * For now we don't grow or shrink orphan file. We just use
25 * whatever was allocated at mke2fs time. The additional
26 * credits we would have to reserve for each orphan inode
27 * operation just don't seem worth it.
/* Reserve one entry in block i while of_lock is still held. */
31 oi->of_binfo[i].ob_free_entries--;
32 spin_unlock(&oi->of_lock);
35 * Get access to orphan block. We have dropped of_lock but since we
36 * have decremented number of free entries we are guaranteed free entry
/*
 * NOTE(review): the handling of a non-zero ret is not visible here. If the
 * function returns early on failure, check that the ob_free_entries
 * decrement above is undone, otherwise the reserved entry is lost.
 */
39 ret = ext4_journal_get_write_access(handle, inode->i_sb,
40 oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
44 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
45 spin_lock(&oi->of_lock);
46 /* Find empty slot in a block */
47 for (j = 0; j < inodes_per_ob && bdata[j]; j++);
/* The reservation made earlier means a zero slot must exist in this block. */
48 BUG_ON(j == inodes_per_ob);
49 bdata[j] = cpu_to_le32(inode->i_ino);
/* The index encodes both the block (i) and the slot within it (j). */
50 EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
51 ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
52 spin_unlock(&oi->of_lock);
54 return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
58 * ext4_orphan_add() links an unlinked or truncated inode into a list of
59 * such inodes, starting at the superblock, in case we crash before the
60 * file is closed/deleted, or in case the inode truncate spans multiple
61 * transactions and the last transaction is not recovered after a crash.
63 * At filesystem recovery time, we walk this list deleting unlinked
64 * inodes and truncating linked inodes in ext4_orphan_cleanup().
66 * Orphan list manipulation functions must be called under i_mutex unless
67 * we are just creating the inode or deleting it.
/*
 * ext4_orphan_add() - start tracking @inode as an orphan.
 * @handle: journal handle for the superblock and inode updates
 * @inode:  inode to track
 *
 * Visible flow: when the orphan file has blocks (of_blocks != 0) the inode is
 * first offered to ext4_orphan_file_add(). The list path obtains journal
 * write access to the superblock buffer and reserves the inode for writing.
 * Under s_orphan_lock it links the inode at the head of the on-disk list
 * through s_last_orphan, unless NEXT_ORPHAN(inode) already holds a plausible
 * value, and adds it to the in-memory list sbi->s_orphan. The superblock
 * buffer and the inode location are then marked dirty, and the inode is
 * taken off the in-memory list again on the failure path.
 *
 * NOTE(review): the early-return statements, the declarations of err/rc and
 * the conditions guarding the error paths are not visible in this excerpt.
 */
69 int ext4_orphan_add(handle_t *handle, struct inode *inode)
71 struct super_block *sb = inode->i_sb;
72 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 struct ext4_iloc iloc;
77 if (!sbi->s_journal || is_bad_inode(inode))
/* Warn once when the inode is neither new/being freed nor locked. */
80 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
81 !inode_is_locked(inode));
83 * Inode orphaned in orphan file or in orphan list?
85 if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
86 !list_empty(&EXT4_I(inode)->i_orphan))
90 * Orphan handling is only valid for files with data blocks
91 * being truncated, or files being unlinked. Note that we either
92 * hold i_mutex, or the inode can not be referenced from outside,
93 * so i_nlink should not be bumped due to race
95 ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
96 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
/* Try the orphan file first when the filesystem has orphan blocks. */
98 if (sbi->s_orphan_info.of_blocks) {
99 err = ext4_orphan_file_add(handle, inode);
101 * Fallback to normal orphan list of orphan file is
108 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
109 err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
114 err = ext4_reserve_inode_write(handle, inode, &iloc);
118 mutex_lock(&sbi->s_orphan_lock);
120 * Due to previous errors inode may be already a part of on-disk
121 * orphan list. If so skip on-disk list modification.
123 if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
124 (le32_to_cpu(sbi->s_es->s_inodes_count))) {
125 /* Insert this inode at the head of the on-disk orphan list */
126 NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
/* The superblock field and its checksum are updated under the buffer lock. */
127 lock_buffer(sbi->s_sbh);
128 sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
129 ext4_superblock_csum_set(sb);
130 unlock_buffer(sbi->s_sbh);
/* Mirror the on-disk list in memory while s_orphan_lock is still held. */
133 list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
134 mutex_unlock(&sbi->s_orphan_lock);
137 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
138 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
143 * We have to remove inode from in-memory list if
144 * addition to on disk orphan list failed. Stray orphan
145 * list entries can cause panics at unmount time.
147 mutex_lock(&sbi->s_orphan_lock);
148 list_del_init(&EXT4_I(inode)->i_orphan);
149 mutex_unlock(&sbi->s_orphan_lock);
154 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
155 jbd_debug(4, "orphan inode %lu will point to %d\n",
156 inode->i_ino, NEXT_ORPHAN(inode));
158 ext4_std_error(sb, err);
/*
 * ext4_orphan_file_del() - release the orphan file slot held by @inode.
 * @handle: journal handle for the orphan block update
 * @inode:  inode whose slot is identified by i_orphan_idx
 *
 * Splits i_orphan_idx into a block number and an offset within that block,
 * warns once if the block number is out of range, obtains journal write
 * access to the block, and under of_lock gives the entry back by
 * incrementing ob_free_entries. The block buffer is then marked dirty,
 * EXT4_STATE_ORPHAN_FILE is cleared and i_orphan is reinitialised.
 *
 * NOTE(review): bdata and off are computed, but the store that clears the
 * slot itself is not visible in this excerpt - presumably bdata[off] is
 * zeroed under of_lock; confirm. Return statements are also not shown.
 */
162 static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
164 struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
167 int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
/* Inverse of the index encoding used when the slot was allocated. */
172 blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
173 off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
174 if (WARN_ON_ONCE(blk >= oi->of_blocks))
177 ret = ext4_journal_get_write_access(handle, inode->i_sb,
178 oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
182 bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
183 spin_lock(&oi->of_lock);
185 oi->of_binfo[blk].ob_free_entries++;
186 spin_unlock(&oi->of_lock);
187 ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
189 ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
190 INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
196 * ext4_orphan_del() removes an unlinked or truncated inode from the list
197 * of such inodes stored on disk, because it is finally being cleaned up.
/*
 * ext4_orphan_del() - stop tracking @inode as an orphan.
 * @handle: journal handle; the visible code tests it for NULL before touching
 *          the on-disk list
 * @inode:  inode to remove
 *
 * Inodes flagged EXT4_STATE_ORPHAN_FILE are handed to ext4_orphan_file_del().
 * Otherwise the inode is reserved for writing before s_orphan_lock is taken,
 * then unlinked from the in-memory list under that lock. With a usable handle
 * the on-disk singly linked list is patched: when the inode was first on the
 * in-memory list, s_last_orphan in the superblock is pointed at the next
 * orphan; otherwise NEXT_ORPHAN() of the previous inode is. Finally
 * NEXT_ORPHAN(inode) is reset to 0 and the inode location is marked dirty.
 *
 * NOTE(review): return statements, goto labels and the declarations of err
 * and ino_next are not visible in this excerpt.
 */
199 int ext4_orphan_del(handle_t *handle, struct inode *inode)
201 struct list_head *prev;
202 struct ext4_inode_info *ei = EXT4_I(inode);
203 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
205 struct ext4_iloc iloc;
208 if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
211 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
212 !inode_is_locked(inode));
/* Inodes recorded in the orphan file take the dedicated removal path. */
213 if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
214 return ext4_orphan_file_del(handle, inode);
216 /* Do this quick check before taking global s_orphan_lock. */
217 if (list_empty(&ei->i_orphan))
221 /* Grab inode buffer early before taking global s_orphan_lock */
222 err = ext4_reserve_inode_write(handle, inode, &iloc);
225 mutex_lock(&sbi->s_orphan_lock);
226 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
/*
 * Remember the predecessor before unlinking: it decides below whether the
 * superblock or the previous inode has to be rewritten.
 */
228 prev = ei->i_orphan.prev;
229 list_del_init(&ei->i_orphan);
231 /* If we're on an error path, we may not have a valid
232 * transaction handle with which to update the orphan list on
233 * disk, but we still need to remove the inode from the linked
235 if (!handle || err) {
236 mutex_unlock(&sbi->s_orphan_lock);
240 ino_next = NEXT_ORPHAN(inode);
241 if (prev == &sbi->s_orphan) {
242 jbd_debug(4, "superblock will point to %u\n", ino_next);
243 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
244 err = ext4_journal_get_write_access(handle, inode->i_sb,
245 sbi->s_sbh, EXT4_JTR_NONE);
247 mutex_unlock(&sbi->s_orphan_lock);
250 lock_buffer(sbi->s_sbh);
251 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
252 ext4_superblock_csum_set(inode->i_sb);
253 unlock_buffer(sbi->s_sbh);
254 mutex_unlock(&sbi->s_orphan_lock);
255 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
257 struct ext4_iloc iloc2;
258 struct inode *i_prev =
259 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
261 jbd_debug(4, "orphan inode %lu will point to %u\n",
262 i_prev->i_ino, ino_next);
263 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
265 mutex_unlock(&sbi->s_orphan_lock);
268 NEXT_ORPHAN(i_prev) = ino_next;
269 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
270 mutex_unlock(&sbi->s_orphan_lock);
274 NEXT_ORPHAN(inode) = 0;
275 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
277 ext4_std_error(inode->i_sb, err);
/*
 * ext4_quota_on_mount() - enable one quota type through dquot_quota_on_mount().
 * @sb:   superblock of the filesystem
 * @type: index into s_qf_names[] selecting the quota type
 *
 * Passes the quota file name stored for @type together with s_jquota_fmt.
 * The name pointer is read with rcu_dereference_protected(), which asserts
 * via lockdep that sb->s_umount is held by the caller.
 */
286 static int ext4_quota_on_mount(struct super_block *sb, int type)
288 return dquot_quota_on_mount(sb,
289 rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
290 lockdep_is_held(&sb->s_umount)),
291 EXT4_SB(sb)->s_jquota_fmt, type);
/*
 * ext4_process_orphan() - finish the pending operation on one orphan inode.
 * @inode:        orphan inode obtained by the caller
 * @nr_truncates: counter of truncated inodes
 * @nr_orphans:   counter of deleted inodes
 *
 * Initialises quota for the inode. An inode that still has links is truncated
 * to its recorded i_size: page cache beyond i_size is dropped and
 * ext4_truncate() is called. The failure path removes the inode from the
 * in-core orphan list with a NULL handle and reports the error. An inode
 * without links is only logged here. The closing iput() drops the reference,
 * which is where the existing comment says the delete happens.
 *
 * NOTE(review): the updates of *nr_truncates and *nr_orphans, the else
 * keyword and the test of ret are not visible in this excerpt - presumably
 * each counter is incremented in its branch; confirm.
 */
295 static void ext4_process_orphan(struct inode *inode,
296 int *nr_truncates, int *nr_orphans)
298 struct super_block *sb = inode->i_sb;
301 dquot_initialize(inode);
/* A non-zero link count selects the truncate path instead of deletion. */
302 if (inode->i_nlink) {
303 if (test_opt(sb, DEBUG))
304 ext4_msg(sb, KERN_DEBUG,
305 "%s: truncating inode %lu to %lld bytes",
306 __func__, inode->i_ino, inode->i_size);
307 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
308 inode->i_ino, inode->i_size);
310 truncate_inode_pages(inode->i_mapping, inode->i_size);
311 ret = ext4_truncate(inode);
314 * We need to clean up the in-core orphan list
315 * manually if ext4_truncate() failed to get a
316 * transaction handle.
318 ext4_orphan_del(NULL, inode);
319 ext4_std_error(inode->i_sb, ret);
324 if (test_opt(sb, DEBUG))
325 ext4_msg(sb, KERN_DEBUG,
326 "%s: deleting unreferenced inode %lu",
327 __func__, inode->i_ino);
328 jbd_debug(2, "deleting unreferenced inode %lu\n",
332 iput(inode); /* The delete magic happens here! */
335 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
336 * the superblock) which were deleted from all directories, but held open by
337 * a process at the time of a crash. We walk the list and try to delete these
338 * inodes at recovery time (only with a read-write filesystem).
340 * In order to keep the orphan inode chain consistent during traversal (in
341 * case of crash during recovery), we link each inode into the superblock
342 * orphan list_head and handle it the same way as an inode deletion during
343 * normal operation (which journals the operations for us).
345 * We only do an iget() and an iput() on each inode, which is very safe if we
346 * accidentally point at an in-use or already deleted inode. The worst that
347 * can happen in this case is that we get a "bit already cleared" message from
348 * ext4_free_inode(). The only reason we would point at a wrong inode is if
349 * e2fsck was run on this filesystem, and it must have already done the orphan
350 * inode cleanup for us, so we can safely abort without any further action.
/*
 * ext4_orphan_cleanup() - process all orphan inodes recorded on disk.
 * @sb: superblock being mounted
 * @es: on-disk superblock holding s_last_orphan
 *
 * Visible flow:
 *  - nothing is done when s_last_orphan is zero and there are no orphan
 *    file blocks;
 *  - cleanup is skipped with a message when the block device is read-only
 *    or ext4_feature_set_ok() rejects the feature set;
 *  - on a filesystem flagged EXT4_ERROR_FS the list head is cleared (unless
 *    mounted read-only) and recovery is skipped;
 *  - SB_RDONLY is cleared in sb->s_flags for the duration of the cleanup and
 *    restored from the saved s_flags at the end;
 *  - quotas are turned on first and turned off again afterwards;
 *  - the superblock list is walked via s_last_orphan, then every orphan file
 *    block is scanned, each inode going through ext4_process_orphan().
 *
 * NOTE(review): return statements, IS_ERR checks on the looked-up inodes,
 * the test that skips empty orphan file slots and several declarations are
 * not visible in this excerpt.
 */
352 void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
/* Saved so the original SB_RDONLY state can be restored on exit. */
354 unsigned int s_flags = sb->s_flags;
355 int nr_orphans = 0, nr_truncates = 0;
359 int quota_update = 0;
362 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
363 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
365 if (!es->s_last_orphan && !oi->of_blocks) {
366 jbd_debug(4, "no orphan inodes to clean up\n");
370 if (bdev_read_only(sb->s_bdev)) {
371 ext4_msg(sb, KERN_ERR, "write access "
372 "unavailable, skipping orphan cleanup");
376 /* Check if feature set would not allow a r/w mount */
377 if (!ext4_feature_set_ok(sb, 0)) {
378 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
379 "unknown ROCOMPAT features");
383 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
384 /* don't clear list on RO mount w/ errors */
385 if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
386 ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
387 "clearing orphan list.\n");
388 es->s_last_orphan = 0;
390 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
/* Cleanup needs to write, so the read-only flag is dropped temporarily. */
394 if (s_flags & SB_RDONLY) {
395 ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
396 sb->s_flags &= ~SB_RDONLY;
400 * Turn on quotas which were not enabled for read-only mounts if
401 * filesystem has quota feature, so that they are updated correctly.
403 if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
404 int ret = ext4_enable_quotas(sb);
409 ext4_msg(sb, KERN_ERR,
410 "Cannot turn on quotas: error %d", ret);
413 /* Turn on journaled quotas used for old style */
414 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
415 if (EXT4_SB(sb)->s_qf_names[i]) {
416 int ret = ext4_quota_on_mount(sb, i);
421 ext4_msg(sb, KERN_ERR,
422 "Cannot turn on journaled "
423 "quota: type %d: error %d", i, ret);
/* First pass: drain the orphan list anchored at es->s_last_orphan. */
428 while (es->s_last_orphan) {
430 * We may have encountered an error during cleanup; if
433 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
434 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
435 es->s_last_orphan = 0;
439 inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
441 es->s_last_orphan = 0;
445 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
446 ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
/* Second pass: walk the slots of each orphan file block. */
449 for (i = 0; i < oi->of_blocks; i++) {
450 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
451 for (j = 0; j < inodes_per_ob; j++) {
454 inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
/* Mark the inode so its removal goes through the orphan file path. */
457 ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
458 EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
459 ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
/* Expands to two arguments: the count and the plural suffix for "%d ...%s". */
463 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
466 ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
469 ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
470 PLURAL(nr_truncates));
472 /* Turn off quotas if they were enabled for orphan cleanup */
474 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
475 if (sb_dqopt(sb)->files[i])
476 dquot_quota_off(sb, i);
480 sb->s_flags = s_flags; /* Restore SB_RDONLY status */
/*
 * ext4_release_orphan_info() - drop the buffer heads held for the orphan file.
 * @sb: superblock whose s_orphan_info is released
 *
 * Calls brelse() on the buffer head of every orphan block.
 *
 * NOTE(review): any early return and the freeing of the of_binfo array
 * itself are not visible in this excerpt - confirm against the full source.
 */
483 void ext4_release_orphan_info(struct super_block *sb)
486 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
490 for (i = 0; i < oi->of_blocks; i++)
491 brelse(oi->of_binfo[i].ob_bh);
/*
 * ext4_orphan_block_tail() - locate the tail structure of an orphan block.
 * @sb: superblock providing the block size
 * @bh: buffer head of the orphan block
 *
 * Returns a pointer to the last sizeof(struct ext4_orphan_block_tail) bytes
 * of the block data, i.e. b_data + s_blocksize minus the tail size.
 */
495 static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
496 struct super_block *sb,
497 struct buffer_head *bh)
499 return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
500 sizeof(struct ext4_orphan_block_tail));
/*
 * ext4_orphan_file_block_csum_verify() - check the checksum of an orphan block.
 * @sb: superblock of the filesystem
 * @bh: buffer head of the orphan block to verify
 *
 * The checksum is seeded with oi->of_csum_seed, fed the block number as a
 * little-endian 64-bit value, and then the first
 * inodes_per_ob * sizeof(__u32) bytes of the block data. The result is
 * non-zero when it equals ob_checksum stored in the block tail.
 *
 * NOTE(review): the value returned when metadata checksums are disabled and
 * the declaration of calculated are not visible in this excerpt.
 */
503 static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
504 struct buffer_head *bh)
507 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
508 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
509 struct ext4_orphan_block_tail *ot;
/* The block number is converted to little-endian before being checksummed. */
510 __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
512 if (!ext4_has_metadata_csum(sb))
515 ot = ext4_orphan_block_tail(sb, bh);
516 calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
517 (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
518 calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
519 inodes_per_ob * sizeof(__u32));
520 return le32_to_cpu(ot->ob_checksum) == calculated;
523 /* This gets called only when checksumming is enabled */
/*
 * ext4_orphan_file_block_trigger() - recompute the checksum of an orphan block.
 * @triggers: trigger descriptor; the superblock is taken from EXT4_TRIGGER()
 * @bh:       buffer head of the orphan block
 * @data:     block contents to checksum
 * @size:     not referenced in the visible code
 *
 * Computes the checksum in the same order as the verify routine (seed, then
 * little-endian block number, then inodes_per_ob 32-bit entries), reading the
 * entries from @data. The result is stored as ob_checksum in the tail located
 * through @bh.
 */
524 void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
525 struct buffer_head *bh,
526 void *data, size_t size)
528 struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
530 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
531 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
532 struct ext4_orphan_block_tail *ot;
533 __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
535 csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
536 (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
537 csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
538 inodes_per_ob * sizeof(__u32));
/* Entries are read from data, but the tail is addressed via bh->b_data. */
539 ot = ext4_orphan_block_tail(sb, bh);
540 ot->ob_checksum = cpu_to_le32(csum);
/*
 * ext4_init_orphan_info() - load the orphan file into sb->s_orphan_info.
 * @sb: superblock being set up
 *
 * Initialises of_lock. When the orphan_file feature is present, the inode
 * named by s_orphan_file_inum is looked up as a special inode. of_blocks is
 * its size in filesystem blocks and of_csum_seed is copied from the inode.
 * One ext4_orphan_block is allocated per block, and each block is read with
 * ext4_bread() and checked for the tail magic and checksum before its free
 * entries are counted into ob_free_entries. On failure the buffers read so
 * far are released.
 *
 * NOTE(review): the return values, goto labels, the allocation failure
 * check, the body of the counting loop and the final iput/kfree calls are
 * not visible in this excerpt.
 */
543 int ext4_init_orphan_info(struct super_block *sb)
545 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
551 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
552 struct ext4_orphan_block_tail *ot;
553 ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
555 spin_lock_init(&oi->of_lock);
557 if (!ext4_has_feature_orphan_file(sb))
560 inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
562 ext4_msg(sb, KERN_ERR, "get orphan inode failed");
563 return PTR_ERR(inode);
/* Number of whole filesystem blocks covered by the orphan file size. */
565 oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
566 oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
/*
 * NOTE(review): of_blocks comes from the on-disk i_size and the size
 * multiplication below is unchecked; consider kmalloc_array() or
 * kvmalloc_array(), which reject an overflowing element count.
 */
567 oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
573 for (i = 0; i < oi->of_blocks; i++) {
574 oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
575 if (IS_ERR(oi->of_binfo[i].ob_bh)) {
576 ret = PTR_ERR(oi->of_binfo[i].ob_bh);
/* A NULL buffer is tested separately from the error-pointer case above. */
579 if (!oi->of_binfo[i].ob_bh) {
583 ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
584 if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
585 ext4_error(sb, "orphan file block %d: bad magic", i);
589 if (!ext4_orphan_file_block_csum_verify(sb,
590 oi->of_binfo[i].ob_bh)) {
591 ext4_error(sb, "orphan file block %d: bad checksum", i);
595 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
597 for (j = 0; j < inodes_per_ob; j++)
600 oi->of_binfo[i].ob_free_entries = free;
/* Unwind: release the buffers of all blocks before index i. */
605 for (i--; i >= 0; i--)
606 brelse(oi->of_binfo[i].ob_bh);
/*
 * ext4_orphan_file_empty() - test whether the orphan file holds any entry.
 * @sb: superblock of the filesystem
 *
 * Compares ob_free_entries of every orphan block with the per-block slot
 * count; a block whose free count differs has at least one slot in use.
 *
 * NOTE(review): the return statements (for a filesystem without the
 * orphan_file feature, for a used block, and for the all-free case) are not
 * visible in this excerpt - confirm the exact return values.
 */
613 int ext4_orphan_file_empty(struct super_block *sb)
615 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
617 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
619 if (!ext4_has_feature_orphan_file(sb))
621 for (i = 0; i < oi->of_blocks; i++)
622 if (oi->of_binfo[i].ob_free_entries != inodes_per_ob)