/* Ext4 orphan inode handling */
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>

#include "ext4.h"
#include "ext4_jbd2.h"
11 static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
14 struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
18 int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
22 * Find block with free orphan entry. Use CPU number for a naive hash
23 * for a search start in the orphan file
25 start = raw_smp_processor_id()*13 % oi->of_blocks;
28 if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
33 if (++i >= oi->of_blocks)
39 * For now we don't grow or shrink orphan file. We just use
40 * whatever was allocated at mke2fs time. The additional
41 * credits we would have to reserve for each orphan inode
42 * operation just don't seem worth it.
47 ret = ext4_journal_get_write_access(handle, inode->i_sb,
48 oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
50 atomic_inc(&oi->of_binfo[i].ob_free_entries);
54 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
55 /* Find empty slot in a block */
60 * Did we walk through the block several times without
61 * finding free entry? It is theoretically possible
62 * if entries get constantly allocated and freed or
63 * if the block is corrupted. Avoid indefinite looping
64 * and bail. We'll use orphan list instead.
67 atomic_inc(&oi->of_binfo[i].ob_free_entries);
73 if (++j >= inodes_per_ob) {
78 } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
81 EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
82 ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
84 return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
88 * ext4_orphan_add() links an unlinked or truncated inode into a list of
89 * such inodes, starting at the superblock, in case we crash before the
90 * file is closed/deleted, or in case the inode truncate spans multiple
91 * transactions and the last transaction is not recovered after a crash.
93 * At filesystem recovery time, we walk this list deleting unlinked
94 * inodes and truncating linked inodes in ext4_orphan_cleanup().
96 * Orphan list manipulation functions must be called under i_mutex unless
97 * we are just creating the inode or deleting it.
99 int ext4_orphan_add(handle_t *handle, struct inode *inode)
101 struct super_block *sb = inode->i_sb;
102 struct ext4_sb_info *sbi = EXT4_SB(sb);
103 struct ext4_iloc iloc;
107 if (!sbi->s_journal || is_bad_inode(inode))
110 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
111 !inode_is_locked(inode));
113 * Inode orphaned in orphan file or in orphan list?
115 if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
116 !list_empty(&EXT4_I(inode)->i_orphan))
120 * Orphan handling is only valid for files with data blocks
121 * being truncated, or files being unlinked. Note that we either
122 * hold i_mutex, or the inode can not be referenced from outside,
123 * so i_nlink should not be bumped due to race
125 ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
126 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
128 if (sbi->s_orphan_info.of_blocks) {
129 err = ext4_orphan_file_add(handle, inode);
131 * Fallback to normal orphan list of orphan file is
138 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
139 err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
144 err = ext4_reserve_inode_write(handle, inode, &iloc);
148 mutex_lock(&sbi->s_orphan_lock);
150 * Due to previous errors inode may be already a part of on-disk
151 * orphan list. If so skip on-disk list modification.
153 if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
154 (le32_to_cpu(sbi->s_es->s_inodes_count))) {
155 /* Insert this inode at the head of the on-disk orphan list */
156 NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
157 lock_buffer(sbi->s_sbh);
158 sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
159 ext4_superblock_csum_set(sb);
160 unlock_buffer(sbi->s_sbh);
163 list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
164 mutex_unlock(&sbi->s_orphan_lock);
167 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
168 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
173 * We have to remove inode from in-memory list if
174 * addition to on disk orphan list failed. Stray orphan
175 * list entries can cause panics at unmount time.
177 mutex_lock(&sbi->s_orphan_lock);
178 list_del_init(&EXT4_I(inode)->i_orphan);
179 mutex_unlock(&sbi->s_orphan_lock);
184 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
185 jbd_debug(4, "orphan inode %lu will point to %d\n",
186 inode->i_ino, NEXT_ORPHAN(inode));
188 ext4_std_error(sb, err);
192 static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
194 struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
197 int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
202 blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
203 off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
204 if (WARN_ON_ONCE(blk >= oi->of_blocks))
207 ret = ext4_journal_get_write_access(handle, inode->i_sb,
208 oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
212 bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
214 atomic_inc(&oi->of_binfo[blk].ob_free_entries);
215 ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
217 ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
218 INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
224 * ext4_orphan_del() removes an unlinked or truncated inode from the list
225 * of such inodes stored on disk, because it is finally being cleaned up.
227 int ext4_orphan_del(handle_t *handle, struct inode *inode)
229 struct list_head *prev;
230 struct ext4_inode_info *ei = EXT4_I(inode);
231 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
233 struct ext4_iloc iloc;
236 if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
239 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
240 !inode_is_locked(inode));
241 if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
242 return ext4_orphan_file_del(handle, inode);
244 /* Do this quick check before taking global s_orphan_lock. */
245 if (list_empty(&ei->i_orphan))
249 /* Grab inode buffer early before taking global s_orphan_lock */
250 err = ext4_reserve_inode_write(handle, inode, &iloc);
253 mutex_lock(&sbi->s_orphan_lock);
254 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
256 prev = ei->i_orphan.prev;
257 list_del_init(&ei->i_orphan);
259 /* If we're on an error path, we may not have a valid
260 * transaction handle with which to update the orphan list on
261 * disk, but we still need to remove the inode from the linked
263 if (!handle || err) {
264 mutex_unlock(&sbi->s_orphan_lock);
268 ino_next = NEXT_ORPHAN(inode);
269 if (prev == &sbi->s_orphan) {
270 jbd_debug(4, "superblock will point to %u\n", ino_next);
271 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
272 err = ext4_journal_get_write_access(handle, inode->i_sb,
273 sbi->s_sbh, EXT4_JTR_NONE);
275 mutex_unlock(&sbi->s_orphan_lock);
278 lock_buffer(sbi->s_sbh);
279 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
280 ext4_superblock_csum_set(inode->i_sb);
281 unlock_buffer(sbi->s_sbh);
282 mutex_unlock(&sbi->s_orphan_lock);
283 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
285 struct ext4_iloc iloc2;
286 struct inode *i_prev =
287 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
289 jbd_debug(4, "orphan inode %lu will point to %u\n",
290 i_prev->i_ino, ino_next);
291 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
293 mutex_unlock(&sbi->s_orphan_lock);
296 NEXT_ORPHAN(i_prev) = ino_next;
297 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
298 mutex_unlock(&sbi->s_orphan_lock);
302 NEXT_ORPHAN(inode) = 0;
303 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
305 ext4_std_error(inode->i_sb, err);
#ifdef CONFIG_QUOTA
/*
 * Turn on quota of the given @type at mount time, using the quota file
 * name stored in s_qf_names[type] and the format in s_jquota_fmt.
 * Returns the result of dquot_quota_on_mount().
 */
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
	return dquot_quota_on_mount(sb,
		rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
					  lockdep_is_held(&sb->s_umount)),
		EXT4_SB(sb)->s_jquota_fmt, type);
}
#endif
323 static void ext4_process_orphan(struct inode *inode,
324 int *nr_truncates, int *nr_orphans)
326 struct super_block *sb = inode->i_sb;
329 dquot_initialize(inode);
330 if (inode->i_nlink) {
331 if (test_opt(sb, DEBUG))
332 ext4_msg(sb, KERN_DEBUG,
333 "%s: truncating inode %lu to %lld bytes",
334 __func__, inode->i_ino, inode->i_size);
335 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
336 inode->i_ino, inode->i_size);
338 truncate_inode_pages(inode->i_mapping, inode->i_size);
339 ret = ext4_truncate(inode);
342 * We need to clean up the in-core orphan list
343 * manually if ext4_truncate() failed to get a
344 * transaction handle.
346 ext4_orphan_del(NULL, inode);
347 ext4_std_error(inode->i_sb, ret);
352 if (test_opt(sb, DEBUG))
353 ext4_msg(sb, KERN_DEBUG,
354 "%s: deleting unreferenced inode %lu",
355 __func__, inode->i_ino);
356 jbd_debug(2, "deleting unreferenced inode %lu\n",
360 iput(inode); /* The delete magic happens here! */
363 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
364 * the superblock) which were deleted from all directories, but held open by
365 * a process at the time of a crash. We walk the list and try to delete these
366 * inodes at recovery time (only with a read-write filesystem).
368 * In order to keep the orphan inode chain consistent during traversal (in
369 * case of crash during recovery), we link each inode into the superblock
370 * orphan list_head and handle it the same way as an inode deletion during
371 * normal operation (which journals the operations for us).
373 * We only do an iget() and an iput() on each inode, which is very safe if we
374 * accidentally point at an in-use or already deleted inode. The worst that
375 * can happen in this case is that we get a "bit already cleared" message from
376 * ext4_free_inode(). The only reason we would point at a wrong inode is if
377 * e2fsck was run on this filesystem, and it must have already done the orphan
378 * inode cleanup for us, so we can safely abort without any further action.
380 void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
382 unsigned int s_flags = sb->s_flags;
383 int nr_orphans = 0, nr_truncates = 0;
387 int quota_update = 0;
390 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
391 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
393 if (!es->s_last_orphan && !oi->of_blocks) {
394 jbd_debug(4, "no orphan inodes to clean up\n");
398 if (bdev_read_only(sb->s_bdev)) {
399 ext4_msg(sb, KERN_ERR, "write access "
400 "unavailable, skipping orphan cleanup");
404 /* Check if feature set would not allow a r/w mount */
405 if (!ext4_feature_set_ok(sb, 0)) {
406 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
407 "unknown ROCOMPAT features");
411 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
412 /* don't clear list on RO mount w/ errors */
413 if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
414 ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
415 "clearing orphan list.\n");
416 es->s_last_orphan = 0;
418 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
422 if (s_flags & SB_RDONLY) {
423 ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
424 sb->s_flags &= ~SB_RDONLY;
428 * Turn on quotas which were not enabled for read-only mounts if
429 * filesystem has quota feature, so that they are updated correctly.
431 if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
432 int ret = ext4_enable_quotas(sb);
437 ext4_msg(sb, KERN_ERR,
438 "Cannot turn on quotas: error %d", ret);
441 /* Turn on journaled quotas used for old sytle */
442 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
443 if (EXT4_SB(sb)->s_qf_names[i]) {
444 int ret = ext4_quota_on_mount(sb, i);
449 ext4_msg(sb, KERN_ERR,
450 "Cannot turn on journaled "
451 "quota: type %d: error %d", i, ret);
456 while (es->s_last_orphan) {
458 * We may have encountered an error during cleanup; if
461 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
462 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
463 es->s_last_orphan = 0;
467 inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
469 es->s_last_orphan = 0;
473 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
474 ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
477 for (i = 0; i < oi->of_blocks; i++) {
478 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
479 for (j = 0; j < inodes_per_ob; j++) {
482 inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
485 ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
486 EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
487 ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
491 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
494 ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
497 ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
498 PLURAL(nr_truncates));
500 /* Turn off quotas if they were enabled for orphan cleanup */
502 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
503 if (sb_dqopt(sb)->files[i])
504 dquot_quota_off(sb, i);
508 sb->s_flags = s_flags; /* Restore SB_RDONLY status */
511 void ext4_release_orphan_info(struct super_block *sb)
514 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
518 for (i = 0; i < oi->of_blocks; i++)
519 brelse(oi->of_binfo[i].ob_bh);
523 static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
524 struct super_block *sb,
525 struct buffer_head *bh)
527 return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
528 sizeof(struct ext4_orphan_block_tail));
531 static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
532 struct buffer_head *bh)
535 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
536 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
537 struct ext4_orphan_block_tail *ot;
538 __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
540 if (!ext4_has_metadata_csum(sb))
543 ot = ext4_orphan_block_tail(sb, bh);
544 calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
545 (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
546 calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
547 inodes_per_ob * sizeof(__u32));
548 return le32_to_cpu(ot->ob_checksum) == calculated;
551 /* This gets called only when checksumming is enabled */
552 void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
553 struct buffer_head *bh,
554 void *data, size_t size)
556 struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
558 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
559 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
560 struct ext4_orphan_block_tail *ot;
561 __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
563 csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
564 (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
565 csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
566 inodes_per_ob * sizeof(__u32));
567 ot = ext4_orphan_block_tail(sb, bh);
568 ot->ob_checksum = cpu_to_le32(csum);
571 int ext4_init_orphan_info(struct super_block *sb)
573 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
579 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
580 struct ext4_orphan_block_tail *ot;
581 ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
583 if (!ext4_has_feature_orphan_file(sb))
586 inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
588 ext4_msg(sb, KERN_ERR, "get orphan inode failed");
589 return PTR_ERR(inode);
591 oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
592 oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
593 oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
599 for (i = 0; i < oi->of_blocks; i++) {
600 oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
601 if (IS_ERR(oi->of_binfo[i].ob_bh)) {
602 ret = PTR_ERR(oi->of_binfo[i].ob_bh);
605 if (!oi->of_binfo[i].ob_bh) {
609 ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
610 if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
611 ext4_error(sb, "orphan file block %d: bad magic", i);
615 if (!ext4_orphan_file_block_csum_verify(sb,
616 oi->of_binfo[i].ob_bh)) {
617 ext4_error(sb, "orphan file block %d: bad checksum", i);
621 bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
623 for (j = 0; j < inodes_per_ob; j++)
626 atomic_set(&oi->of_binfo[i].ob_free_entries, free);
631 for (i--; i >= 0; i--)
632 brelse(oi->of_binfo[i].ob_bh);
639 int ext4_orphan_file_empty(struct super_block *sb)
641 struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
643 int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
645 if (!ext4_has_feature_orphan_file(sb))
647 for (i = 0; i < oi->of_blocks; i++)
648 if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=