pgoff_t idx;
int rc;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the ctx->completion_lock. That does not work with the
+ * migration workflow of MIGRATE_SYNC_NO_COPY.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
rc = 0;
/* mapping->private_lock here protects against the kioctx teardown. */
#endif
};
-static int aio_setup_ring(struct kioctx *ctx)
+static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
struct aio_ring *ring;
- unsigned nr_events = ctx->max_reqs;
struct mm_struct *mm = current->mm;
unsigned long size, unused;
int nr_pages;
struct kioctx *ctx;
int err = -ENOMEM;
+ /*
+ * Store the original nr_events -- what userspace passed to io_setup(),
+ * for counting against the global limit -- before it changes.
+ */
+ unsigned int max_reqs = nr_events;
+
/*
* We keep track of the number of available ringbuffer slots, to prevent
* overflow (reqs_available), and we also use percpu counters for this.
return ERR_PTR(-EINVAL);
}
- if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+ if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
if (!ctx)
return ERR_PTR(-ENOMEM);
- ctx->max_reqs = nr_events;
+ ctx->max_reqs = max_reqs;
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
if (!ctx->cpu)
goto err;
- err = aio_setup_ring(ctx);
+ err = aio_setup_ring(ctx, nr_events);
if (err < 0)
goto err;
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
- if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
- aio_nr + nr_events < aio_nr) {
+ if (aio_nr + ctx->max_reqs > aio_max_nr ||
+ aio_nr + ctx->max_reqs < aio_nr) {
spin_unlock(&aio_nr_lock);
err = -EAGAIN;
goto err_ctx;
goto out_put_req;
}
- if ((req->common.ki_flags & IOCB_NOWAIT) &&
- !(req->common.ki_flags & IOCB_DIRECT)) {
- ret = -EOPNOTSUPP;
- goto out_put_req;
- }
-
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
}
bio_init(&bio, vecs, nr_pages);
- bio.bi_bdev = bdev;
+ bio_set_dev(&bio, bdev);
bio.bi_iter.bi_sector = pos >> 9;
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
blk_start_plug(&plug);
for (;;) {
- bio->bi_bdev = bdev;
+ bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
+ bdev->bd_partno = partno;
if (!partno) {
ret = -ENXIO;
*/
filp->f_flags |= O_LARGEFILE;
+ filp->f_mode |= FMODE_NOWAIT;
+
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
if (filp->f_flags & O_EXCL)
if (iocb->ki_pos >= size)
return -ENOSPC;
+ if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
iov_iter_truncate(from, size - iocb->ki_pos);
blk_start_plug(&plug);
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshoting(root);
+ ret = btrfs_start_write_no_snapshotting(root);
if (!ret)
return -ENOSPC;
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_end_write_no_snapshoting(root);
+ btrfs_end_write_no_snapshotting(root);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshoting(root);
+ btrfs_end_write_no_snapshotting(root);
break;
}
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshoting(root);
+ btrfs_end_write_no_snapshotting(root);
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshoting(root);
+ btrfs_end_write_no_snapshotting(root);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes);
} else {
loff_t oldsize;
int clean_page = 0;
+ if (!(iocb->ki_flags & IOCB_DIRECT) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
+
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
int btrfs_release_file(struct inode *inode, struct file *filp)
{
- if (filp->private_data)
+ struct btrfs_file_private *private = filp->private_data;
+
+ if (private && private->trans)
btrfs_ioctl_trans_end(filp);
+ if (private && private->filldir_buf)
+ kfree(private->filldir_buf);
+ kfree(private);
+ filp->private_data = NULL;
+
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_AIO_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT;
return generic_file_open(inode, filp);
}
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+ if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ /*
+ * We have to distinguish real writes from writes which will result in a
+ * COW page; COW writes should *not* poke the journal (the file will not
+ * be changed). Doing so would cause unintended failures when mounted
+ * read-only.
+ *
+ * We check for VM_SHARED rather than vmf->cow_page since the latter is
+ * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * other sizes, dax_iomap_fault will handle splitting / fallback so that
+ * we eventually come back with a COW page.
+ */
+ bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
if (write) {
sb_start_pagefault(sb);
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- loff_t size;
- int ret;
-
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else
- ret = dax_pfn_mkwrite(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- sb_end_pagefault(sb);
-
- return ret;
-}
-
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
- .pfn_mkwrite = ext4_dax_pfn_mkwrite,
+ .pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
return -EIO;
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
- !(sb->s_flags & MS_RDONLY))) {
+ !sb_rdonly(sb))) {
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
/*
* Sample where the filesystem has been mounted and
return ret;
}
- /* Set the flags to support nowait AIO */
- filp->f_mode |= FMODE_AIO_NOWAIT;
-
+ filp->f_mode |= FMODE_NOWAIT;
return dquot_file_open(inode, filp);
}
pagevec_init(&pvec, 0);
do {
- int i, num;
+ int i;
unsigned long nr_pages;
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- (pgoff_t)num);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ &index, end);
if (nr_pages == 0)
break;
goto out;
}
- if (page->index > end)
- goto out;
-
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping)) {
unlock_page(page);
}
- /* The no. of pages is less than our desired, we are done. */
- if (nr_pages < num)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
+ /* There are no pages upto endoff - that would be a hole in there. */
if (whence == SEEK_HOLE && lastoff < endoff) {
found = 1;
*offset = lastoff;
inode_lock(inode);
isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset < 0 || offset >= isize) {
inode_unlock(inode);
return -ENXIO;
}
inode_lock(inode);
isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset < 0 || offset >= isize) {
inode_unlock(inode);
return -ENXIO;
}
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
int enospc = 0;
int iolock;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_AIO_NOWAIT;
+ file->f_mode |= FMODE_NOWAIT;
return 0;
}
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
-
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
-STATIC int
-xfs_filemap_page_mkwrite(
- struct vm_fault *vmf)
+static int
+__xfs_filemap_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
int ret;
- trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+ trace_xfs_filemap_fault(ip, pe_size, write_fault);
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (write_fault) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+ }
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
} else {
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
- ret = block_page_mkwrite_return(ret);
+ if (write_fault)
+ ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+ else
+ ret = filemap_fault(vmf);
}
-
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
+ if (write_fault)
+ sb_end_pagefault(inode->i_sb);
return ret;
}
-STATIC int
+static int
xfs_filemap_fault(
struct vm_fault *vmf)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret;
-
- trace_xfs_filemap_fault(XFS_I(inode));
-
/* DAX can shortcut the normal fault path on write faults! */
- if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
- return xfs_filemap_page_mkwrite(vmf);
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- else
- ret = filemap_fault(vmf);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
- return ret;
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ IS_DAX(file_inode(vmf->vma->vm_file)) &&
+ (vmf->flags & FAULT_FLAG_WRITE));
}
-/*
- * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
- * both read and write faults. Hence we need to handle both cases. There is no
- * ->huge_mkwrite callout for huge pages, so we have a single function here to
- * handle both cases here. @flags carries the information on the type of fault
- * occuring.
- */
-STATIC int
+static int
xfs_filemap_huge_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret;
-
- if (!IS_DAX(inode))
+ if (!IS_DAX(file_inode(vmf->vma->vm_file)))
return VM_FAULT_FALLBACK;
- trace_xfs_filemap_huge_fault(ip);
-
- if (vmf->flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- }
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
- if (vmf->flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(inode->i_sb);
+ /* DAX can shortcut the normal fault path on write faults! */
+ return __xfs_filemap_fault(vmf, pe_size,
+ (vmf->flags & FAULT_FLAG_WRITE));
+}
- return ret;
+static int
+xfs_filemap_page_mkwrite(
+ struct vm_fault *vmf)
+{
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
/*
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else if (IS_DAX(inode))
- ret = dax_pfn_mkwrite(vmf);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
+typedef __kernel_rwf_t rwf_t;
+
struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
- /* File is capable of returning -EAGAIN if AIO will block */
- #define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000)
+ /* File is capable of returning -EAGAIN if I/O will block */
+ #define FMODE_NOWAIT ((__force fmode_t)0x8000000)
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
- struct rb_root i_mmap; /* tree of private and shared mappings */
+ struct rb_root_cached i_mmap; /* tree of private and shared mappings */
struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
#endif
struct block_device * bd_contains;
unsigned bd_block_size;
+ u8 bd_partno;
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
*/
static inline int mapping_mapped(struct address_space *mapping)
{
- return !RB_EMPTY_ROOT(&mapping->i_mmap);
+ return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}
/*
unsigned char fl_type;
unsigned int fl_pid;
int fl_link_cpu; /* what cpu's list is this on? */
- struct pid *fl_nspid;
wait_queue_head_t fl_wait;
struct file *fl_file;
loff_t fl_start;
static inline struct dentry *file_dentry(const struct file *file)
{
- return d_real(file->f_path.dentry, file_inode(file), 0);
+ return d_real(file->f_path.dentry, file_inode(file), 0, 0);
}
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
-struct mm_struct;
+/*
+ * sb->s_flags. Note that these mirror the equivalent MS_* flags where
+ * represented in both.
+ */
+#define SB_RDONLY 1 /* Mount read-only */
+#define SB_NOSUID 2 /* Ignore suid and sgid bits */
+#define SB_NODEV 4 /* Disallow access to device special files */
+#define SB_NOEXEC 8 /* Disallow program execution */
+#define SB_SYNCHRONOUS 16 /* Writes are synced at once */
+#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */
+#define SB_DIRSYNC 128 /* Directory modifications are synchronous */
+#define SB_NOATIME 1024 /* Do not update access times. */
+#define SB_NODIRATIME 2048 /* Do not update directory access times */
+#define SB_SILENT 32768
+#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
+#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
+#define SB_I_VERSION (1<<23) /* Update inode I_version field */
+#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define SB_SUBMOUNT (1<<26)
+#define SB_NOREMOTELOCK (1<<27)
+#define SB_NOSEC (1<<28)
+#define SB_BORN (1<<29)
+#define SB_ACTIVE (1<<30)
+#define SB_NOUSER (1<<31)
/*
* Umount options
struct iovec **ret_pointer);
extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
-extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- unsigned long, loff_t *, int);
-extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
- unsigned long, loff_t *, int);
+ unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
* possible to override it selectively if you really wanted to with some
* ioctl() that is not currently implemented.
*
- * Exception: MS_RDONLY is always applied to the entire file system.
+ * Exception: SB_RDONLY is always applied to the entire file system.
*
* Unfortunately, it is possible to change a filesystems flags with it mounted
* with files in use. This means that all of the inodes will not have their
*/
#define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg))
-#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
-#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
+static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; }
+#define IS_RDONLY(inode) sb_rdonly((inode)->i_sb)
+#define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \
((inode)->i_flags & S_SYNC))
-#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
+#define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
-#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
-#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME)
-#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION)
+#define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK)
+#define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
+#define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION)
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
-#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
+#define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL)
#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
}
/*
- * ... and these candidates should be on MS_MANDLOCK mounted fs,
+ * ... and these candidates should be on SB_MANDLOCK mounted fs,
* otherwise these will be advisory locks
*/
#endif
/* fs/char_dev.c */
-#define CHRDEV_MAJOR_HASH_SIZE 255
+#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
+/* Marks the top and bottom of the second segment of free char majors */
+#define CHRDEV_MAJOR_DYN_EXT_START 511
+#define CHRDEV_MAJOR_DYN_EXT_END 384
+
extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
#define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */
#ifdef CONFIG_BLOCK
-#define BLKDEV_MAJOR_HASH_SIZE 255
+#define BLKDEV_MAJOR_MAX 512
extern const char *__bdevname(dev_t, char *buffer);
extern const char *bdevname(struct block_device *bdev, char *buffer);
extern struct block_device *lookup_bdev(const char *);
extern void blkdev_show(struct seq_file *,off_t);
#else
-#define BLKDEV_MAJOR_HASH_SIZE 0
+#define BLKDEV_MAJOR_MAX 0
#endif
extern void init_special_inode(struct inode *, umode_t, dev_t);
extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
-extern int filemap_fdatawait(struct address_space *);
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
+
+static inline int filemap_fdatawait(struct address_space *mapping)
+{
+ return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+}
+
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
loff_t lend);
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
extern int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
-
extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
+static inline int file_write_and_wait(struct file *file)
+{
+ return file_write_and_wait_range(file, 0, LLONG_MAX);
+}
+
/**
* filemap_set_wb_err - set a writeback error on an address_space
* @mapping: mapping in which to set writeback error
* When a writeback error occurs, most filesystems will want to call
* filemap_set_wb_err to record the error in the mapping so that it will be
* automatically reported whenever fsync is called on the file.
- *
- * FIXME: mention FS_* flag here?
*/
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
return kernel_read_file_str[id];
}
-extern int kernel_read(struct file *, loff_t, char *, unsigned long);
extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
enum kernel_read_file_id);
extern int kernel_read_file_from_path(char *, void **, loff_t *, loff_t,
enum kernel_read_file_id);
extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
enum kernel_read_file_id);
-extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
-extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
+extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
+extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
+extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
/* fs/dcache.c -- generic fs support functions */
#endif
extern void unlock_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
+extern void evict_inodes(struct super_block *sb);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags);
+ rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags);
+ rwf_t flags);
/* fs/block_dev.c */
extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
+static inline loff_t __inode_get_bytes(struct inode *inode)
+{
+ return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
+}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
static inline int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS);
}
static inline int vfs_fstat(int fd, struct kstat *stat)
{
return res;
}
-static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
if (unlikely(flags & ~RWF_SUPPORTED))
return -EOPNOTSUPP;
if (flags & RWF_NOWAIT) {
- if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
+ if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
return -EOPNOTSUPP;
ki->ki_flags |= IOCB_NOWAIT;
}
static inline void inode_has_no_xattr(struct inode *inode)
{
- if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC))
+ if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC))
inode->i_flags |= S_NOSEC;
}
return -EEXIST;
mapping->nrexceptional--;
- if (!dax_mapping(mapping)) {
- if (shadowp)
- *shadowp = p;
- } else {
- /* DAX can replace empty locked entry with a hole */
- WARN_ON_ONCE(p !=
- dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- /* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index, p,
- true);
- }
+ if (shadowp)
+ *shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
workingset_update_node, mapping);
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- bool ret;
+ struct page *page;
if (end_byte < start_byte)
return false;
if (mapping->nrpages == 0)
return false;
- pagevec_init(&pvec, 0);
- if (!pagevec_lookup(&pvec, mapping, index, 1))
+ if (!find_get_pages_range(mapping, &index, end, 1, &page))
return false;
- ret = (pvec.pages[0]->index <= end);
- pagevec_release(&pvec);
- return ret;
+ put_page(page);
+ return true;
}
EXPORT_SYMBOL(filemap_range_has_page);
}
EXPORT_SYMBOL(filemap_fdatawait_range);
+/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file: file pointing to address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them. Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
/**
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
* @mapping: address space structure to wait for
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- __filemap_fdatawait_range(mapping, 0, i_size - 1);
+ __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
-/**
- * filemap_fdatawait - wait for all under-writeback pages to complete
- * @mapping: address space structure to wait for
- *
- * Walk the list of under-writeback pages of the given address space
- * and wait for all of them. Check error status of the address space
- * and return it.
- *
- * Since the error status of the address space is cleared by this function,
- * callers are responsible for checking the return value and handling and/or
- * reporting the error.
- */
-int filemap_fdatawait(struct address_space *mapping)
+static bool mapping_needs_writeback(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- return filemap_fdatawait_range(mapping, 0, i_size - 1);
+ return (!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional);
}
-EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
- errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+ errseq_t eseq = errseq_set(&mapping->wb_err, err);
trace_filemap_set_wb_err(mapping, eseq);
}
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
unsigned long flags;
+ wait_queue_entry_t bookmark;
key.page = page;
key.bit_nr = bit_nr;
key.page_match = 0;
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key(q, TASK_NORMAL, &key);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ /*
+ * Take a breather from holding the lock,
+ * allow pages that finish wake up asynchronously
+ * to acquire the lock and remove themselves
+ * from wait queue
+ */
+ spin_unlock_irqrestore(&q->lock, flags);
+ cpu_relax();
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ }
+
/*
* It is possible for other pages to have collided on the waitqueue
* hash, so in that case check for a page match. That prevents a long-
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, waiter);
+ __add_wait_queue_entry_tail(q, waiter);
SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
}
/**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping. The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
*
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
*/
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *start = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
rcu_read_unlock();
+
return ret;
}
}
/**
- * do_generic_file_read - generic file read routine
- * @filp: the file to read
- * @ppos: current file position
+ * generic_file_buffered_read - generic file read routine
+ * @iocb: the iocb to read
* @iter: data destination
* @written: already copied
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
- static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
+ struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
+ loff_t *ppos = &iocb->ki_pos;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
page = find_get_page(mapping, index);
if (!page) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
index, last_index - index);
}
if (!PageUptodate(page)) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessarily
goto readpage;
}
+ would_block:
+ error = -EAGAIN;
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_SHIFT;
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- ssize_t retval = 0;
size_t count = iov_iter_count(iter);
+ ssize_t retval = 0;
if (!count)
goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
+ struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
goto out;
}
- retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
+ retval = generic_file_buffered_read(iocb, iter, retval);
out:
return retval;
}