diff --git a/block/fops.c b/block/fops.c
index b907425..679d9b7 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
 #include <linux/falloc.h>
 #include <linux/suspend.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/module.h>
 #include "blk.h"
 
@@ -23,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file)
        return file->f_mapping->host;
 }
 
-static int blkdev_get_block(struct inode *inode, sector_t iblock,
-               struct buffer_head *bh, int create)
-{
-       bh->b_bdev = I_BDEV(inode);
-       bh->b_blocknr = iblock;
-       set_buffer_mapped(bh);
-       return 0;
-}
-
 static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 {
        blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -54,7 +46,7 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, unsigned int nr_pages)
 {
-       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
@@ -81,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
        }
        bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
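+       /* Propagate the per-inode write lifetime hint to the device. */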
+       bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
        bio.bi_ioprio = iocb->ki_ioprio;
 
        ret = bio_iov_iter_get_pages(&bio, iter);
@@ -170,7 +163,7 @@ static void blkdev_bio_end_io(struct bio *bio)
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                unsigned int nr_pages)
 {
-       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
@@ -211,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
        for (;;) {
                bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+               bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;
@@ -221,6 +215,24 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        bio_endio(bio);
                        break;
                }
+               if (iocb->ki_flags & IOCB_NOWAIT) {
+                       /*
+                        * This is nonblocking IO, and we would need to
+                        * allocate another bio if we have data left to map.
+                        * As we cannot guarantee that one of the sub-bios
+                        * will not fail to be issued with NOWAIT, and as
+                        * error results are coalesced across all of them,
+                        * be safe and ask for a retry from blocking context
+                        * instead.
+                        */
+                       if (unlikely(iov_iter_count(iter))) {
+                               bio_release_pages(bio, false);
+                               bio_clear_flag(bio, BIO_REFFED);
+                               bio_put(bio);
+                               blk_finish_plug(&plug);
+                               return -EAGAIN;
+                       }
+                       bio->bi_opf |= REQ_NOWAIT;
+               }
 
                if (is_read) {
                        if (dio->flags & DIO_SHOULD_DIRTY)
@@ -228,9 +240,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                } else {
                        task_io_account_write(bio->bi_iter.bi_size);
                }
-               if (iocb->ki_flags & IOCB_NOWAIT)
-                       bio->bi_opf |= REQ_NOWAIT;
-
                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;
 
@@ -295,7 +304,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                                        struct iov_iter *iter,
                                        unsigned int nr_pages)
 {
-       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        bool is_read = iov_iter_rw(iter) == READ;
        blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
        struct blkdev_dio *dio;
@@ -314,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
        dio->flags = 0;
        dio->iocb = iocb;
        bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+       bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
        bio->bi_end_io = blkdev_bio_end_io_async;
        bio->bi_ioprio = iocb->ki_ioprio;
 
@@ -343,13 +353,14 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                task_io_account_write(bio->bi_iter.bi_size);
        }
 
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               bio->bi_opf |= REQ_NOWAIT;
+
        if (iocb->ki_flags & IOCB_HIPRI) {
-               bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+               bio->bi_opf |= REQ_POLLED;
                submit_bio(bio);
                WRITE_ONCE(iocb->private, bio);
        } else {
-               if (iocb->ki_flags & IOCB_NOWAIT)
-                       bio->bi_opf |= REQ_NOWAIT;
                submit_bio(bio);
        }
        return -EIOCBQUEUED;
@@ -371,9 +382,55 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
 
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+       struct block_device *bdev = I_BDEV(inode);
+       loff_t isize = i_size_read(inode);
+
+       iomap->bdev = bdev;
+       iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+       if (iomap->offset >= isize)
+               return -EIO;
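+       /*
+        * A block device is a single linear extent, so one mapping from
+        * the aligned offset out to EOF can serve the whole request.
+        */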
+       iomap->type = IOMAP_MAPPED;
+       iomap->addr = iomap->offset;
+       iomap->length = isize - iomap->offset;
+       iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
+       return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+       .iomap_begin            = blkdev_iomap_begin,
+};
+
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+               struct buffer_head *bh, int create)
 {
-       return block_write_full_page(page, blkdev_get_block, wbc);
+       bh->b_bdev = I_BDEV(inode);
+       bh->b_blocknr = iblock;
+       set_buffer_mapped(bh);
+       return 0;
+}
+
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly, which does hold the
+ * buffer lock.  That lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the block device's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+               struct writeback_control *wbc)
+{
+       struct blk_plug plug;
+       int err;
+
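+       /* Plug so the bios built for each folio are batched for dispatch. */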
+       blk_start_plug(&plug);
+       err = write_cache_pages(mapping, wbc, block_write_full_folio,
+                       blkdev_get_block);
+       blk_finish_plug(&plug);
+
+       return err;
 }
 
 static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -405,25 +462,66 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
        return ret;
 }
 
-static int blkdev_writepages(struct address_space *mapping,
-                            struct writeback_control *wbc)
-{
-       return generic_writepages(mapping, wbc);
-}
-
 const struct address_space_operations def_blk_aops = {
        .dirty_folio    = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio     = blkdev_read_folio,
        .readahead      = blkdev_readahead,
-       .writepage      = blkdev_writepage,
+       .writepages     = blkdev_writepages,
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
-       .writepages     = blkdev_writepages,
-       .direct_IO      = blkdev_direct_IO,
        .migrate_folio  = buffer_migrate_folio_norefs,
        .is_dirty_writeback = buffer_check_dirty_writeback,
 };
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+       return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+       iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+               struct inode *inode, loff_t offset, unsigned int len)
+{
+       loff_t isize = i_size_read(inode);
+
+       if (WARN_ON_ONCE(offset >= isize))
+               return -EIO;
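+       /* Reuse the cached mapping if it still covers this offset. */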
+       if (offset >= wpc->iomap.offset &&
+           offset < wpc->iomap.offset + wpc->iomap.length)
+               return 0;
+       return blkdev_iomap_begin(inode, offset, isize - offset,
+                                 IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+       .map_blocks             = blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+               struct writeback_control *wbc)
+{
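+       /*
+        * wpc caches the mapping from blkdev_map_blocks() so consecutive
+        * folios can reuse it instead of remapping each time.
+        */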
+       struct iomap_writepage_ctx wpc = { };
+
+       return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+       .dirty_folio    = filemap_dirty_folio,
+       .release_folio          = iomap_release_folio,
+       .invalidate_folio       = iomap_invalidate_folio,
+       .read_folio             = blkdev_read_folio,
+       .readahead              = blkdev_readahead,
+       .writepages             = blkdev_writepages,
+       .is_partially_uptodate  = iomap_is_partially_uptodate,
+       .error_remove_folio     = generic_error_remove_folio,
+       .migrate_folio          = filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
 
 /*
  * for a block special file file_inode(file)->i_size is zero
@@ -443,7 +541,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
 static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
 {
-       struct block_device *bdev = filp->private_data;
+       struct block_device *bdev = I_BDEV(filp->f_mapping->host);
        int error;
 
        error = file_write_and_wait_range(filp, start, end);
@@ -462,44 +560,104 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
        return error;
 }
 
-static int blkdev_open(struct inode *inode, struct file *filp)
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at the file open flags and generate the corresponding block open flags
+ * from them. The function works both for a file that is just being opened
+ * (e.g. during the ->open callback) and for a file that is already open. This
+ * is actually non-trivial (see the comment in the function).
+ */
+blk_mode_t file_to_blk_mode(struct file *file)
 {
-       struct block_device *bdev;
+       blk_mode_t mode = 0;
 
+       if (file->f_mode & FMODE_READ)
+               mode |= BLK_OPEN_READ;
+       if (file->f_mode & FMODE_WRITE)
+               mode |= BLK_OPEN_WRITE;
        /*
-        * Preserve backwards compatibility and allow large file access
-        * even if userspace doesn't ask for it explicitly. Some mkfs
-        * binary needs it. We might want to drop this workaround
-        * during an unstable branch.
+        * do_dentry_open() clears O_EXCL from f_flags, so use
+        * file->private_data to determine whether the open was exclusive
+        * for already open files.
         */
-       filp->f_flags |= O_LARGEFILE;
-       filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
-
-       if (filp->f_flags & O_NDELAY)
-               filp->f_mode |= FMODE_NDELAY;
-       if (filp->f_flags & O_EXCL)
-               filp->f_mode |= FMODE_EXCL;
-       if ((filp->f_flags & O_ACCMODE) == 3)
-               filp->f_mode |= FMODE_WRITE_IOCTL;
-
-       bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
-       if (IS_ERR(bdev))
-               return PTR_ERR(bdev);
-
-       filp->private_data = bdev;
-       filp->f_mapping = bdev->bd_inode->i_mapping;
-       filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
-       return 0;
+       if (file->private_data)
+               mode |= BLK_OPEN_EXCL;
+       else if (file->f_flags & O_EXCL)
+               mode |= BLK_OPEN_EXCL;
+       if (file->f_flags & O_NDELAY)
+               mode |= BLK_OPEN_NDELAY;
+
+       /*
+        * If all bits in O_ACCMODE are set (aka O_RDWR | O_WRONLY), the
+        * floppy driver has historically allowed ioctls as if the file was
+        * opened for writing, but does not allow any actual reads or writes.
+        */
+       if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY))
+               mode |= BLK_OPEN_WRITE_IOCTL;
+
+       return mode;
 }
 
-static int blkdev_close(struct inode *inode, struct file *filp)
+static int blkdev_open(struct inode *inode, struct file *filp)
 {
-       struct block_device *bdev = filp->private_data;
+       struct block_device *bdev;
+       blk_mode_t mode;
+       int ret;
+
+       mode = file_to_blk_mode(filp);
+       /* Use the file as the holder. */
+       if (mode & BLK_OPEN_EXCL)
+               filp->private_data = filp;
+       ret = bdev_permission(inode->i_rdev, mode, filp->private_data);
+       if (ret)
+               return ret;
+
+       bdev = blkdev_get_no_open(inode->i_rdev);
+       if (!bdev)
+               return -ENXIO;
+
+       ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
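+       /* On error, drop the reference taken by blkdev_get_no_open(). */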
+       if (ret)
+               blkdev_put_no_open(bdev);
+       return ret;
+}
 
-       blkdev_put(bdev, filp->f_mode);
+static int blkdev_release(struct inode *inode, struct file *filp)
+{
+       bdev_release(filp);
        return 0;
 }
 
+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       size_t count = iov_iter_count(from);
+       ssize_t written;
+
+       written = kiocb_invalidate_pages(iocb, count);
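+       /*
+        * -EBUSY means some cached pages could not be invalidated; treat
+        * that as a zero-byte write so the caller falls back to buffered IO.
+        */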
+       if (written) {
+               if (written == -EBUSY)
+                       return 0;
+               return written;
+       }
+
+       written = blkdev_direct_IO(iocb, from);
+       if (written > 0) {
+               kiocb_invalidate_post_direct_write(iocb, count);
+               iocb->ki_pos += written;
+               count -= written;
+       }
+       if (written != -EIOCBQUEUED)
+               iov_iter_revert(from, count - iov_iter_count(from));
+       return written;
+}
+
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -509,10 +667,10 @@ static int blkdev_close(struct inode *inode, struct file *filp)
  */
 static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct file *file = iocb->ki_filp;
+       struct block_device *bdev = I_BDEV(file->f_mapping->host);
        struct inode *bd_inode = bdev->bd_inode;
        loff_t size = bdev_nr_bytes(bdev);
-       struct blk_plug plug;
        size_t shorted = 0;
        ssize_t ret;
 
@@ -537,18 +695,28 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
                iov_iter_truncate(from, size);
        }
 
-       blk_start_plug(&plug);
-       ret = __generic_file_write_iter(iocb, from);
+       ret = file_update_time(file);
+       if (ret)
+               return ret;
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               ret = blkdev_direct_write(iocb, from);
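+               /*
+                * A short direct write is completed via the page cache;
+                * direct_write_fallback() then writes back and invalidates
+                * that range to preserve O_DIRECT semantics.
+                */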
+               if (ret >= 0 && iov_iter_count(from))
+                       ret = direct_write_fallback(iocb, from, ret,
+                                       blkdev_buffered_write(iocb, from));
+       } else {
+               ret = blkdev_buffered_write(iocb, from);
+       }
+
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        iov_iter_reexpand(from, iov_iter_count(from) + shorted);
-       blk_finish_plug(&plug);
        return ret;
 }
 
 static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-       struct block_device *bdev = iocb->ki_filp->private_data;
+       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        loff_t size = bdev_nr_bytes(bdev);
        loff_t pos = iocb->ki_pos;
        size_t shorted = 0;
@@ -568,21 +736,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
                goto reexpand; /* skip atime */
 
        if (iocb->ki_flags & IOCB_DIRECT) {
-               struct address_space *mapping = iocb->ki_filp->f_mapping;
-
-               if (iocb->ki_flags & IOCB_NOWAIT) {
-                       if (filemap_range_needs_writeback(mapping, pos,
-                                                         pos + count - 1)) {
-                               ret = -EAGAIN;
-                               goto reexpand;
-                       }
-               } else {
-                       ret = filemap_write_and_wait_range(mapping, pos,
-                                                          pos + count - 1);
-                       if (ret < 0)
-                               goto reexpand;
-               }
-
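+               /*
+                * kiocb_write_and_wait() preserves the old behaviour: with
+                * IOCB_NOWAIT it returns -EAGAIN if the range would need
+                * writeback, otherwise it flushes the range and waits.
+                */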
+               ret = kiocb_write_and_wait(iocb, count);
+               if (ret < 0)
+                       goto reexpand;
                file_accessed(iocb->ki_filp);
 
                ret = blkdev_direct_IO(iocb, to);
@@ -640,24 +796,35 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 
        filemap_invalidate_lock(inode->i_mapping);
 
-       /* Invalidate the page cache, including dirty pages. */
-       error = truncate_bdev_range(bdev, file->f_mode, start, end);
-       if (error)
-               goto fail;
-
+       /*
+        * Invalidate the page cache, including dirty pages, for valid
+        * de-allocate mode calls to fallocate().
+        */
        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+               error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+               if (error)
+                       goto fail;
+
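+               /* Zero the range but keep the blocks allocated (NOUNMAP). */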
                error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
                                             len >> SECTOR_SHIFT, GFP_KERNEL,
                                             BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+               error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+               if (error)
+                       goto fail;
+
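+               /*
+                * Only zero via device offload; NOFALLBACK means no manual
+                * fallback to writing zeroes.
+                */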
                error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
                                             len >> SECTOR_SHIFT, GFP_KERNEL,
                                             BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+               error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+               if (error)
+                       goto fail;
+
                error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
                                             len >> SECTOR_SHIFT, GFP_KERNEL);
                break;
@@ -670,20 +837,30 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
        return error;
 }
 
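+/*
+ * Refuse writable shared mappings when the device itself is read-only,
+ * rather than letting individual writes fail later.
+ */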
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct inode *bd_inode = bdev_file_inode(file);
+
+       if (bdev_read_only(I_BDEV(bd_inode)))
+               return generic_file_readonly_mmap(file, vma);
+
+       return generic_file_mmap(file, vma);
+}
+
 const struct file_operations def_blk_fops = {
        .open           = blkdev_open,
-       .release        = blkdev_close,
+       .release        = blkdev_release,
        .llseek         = blkdev_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
        .iopoll         = iocb_bio_iopoll,
-       .mmap           = generic_file_mmap,
+       .mmap           = blkdev_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = blkdev_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
 #endif
-       .splice_read    = generic_file_splice_read,
+       .splice_read    = filemap_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = blkdev_fallocate,
 };