p = buf;
while (bytes_left >= sizeof(*p)) {
info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE);
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE);
+ info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
/* ipc tcons are not refcounted */
spin_lock(&cifs_tcp_ses_lock);
tcon->tc_count--;
+ /* tc_count can never go negative */
+ WARN_ON(tcon->tc_count < 0);
spin_unlock(&cifs_tcp_ses_lock);
}
kfree(utf16_path);
return rc;
}
+ filemap_invalidate_lock(inode->i_mapping);
/*
* We implement the punch hole through ioctl, so we need remove the page
* caches first, otherwise the data may be inconsistent with the server.
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
free_xid(xid);
+ filemap_invalidate_unlock(inode->i_mapping);
return rc;
}
{
struct cifs_io_parms io_parms = {0};
int nbytes;
+ int rc = 0;
struct kvec iov[2];
io_parms.netfid = cfile->fid.netfid;
io_parms.tcon = tcon;
io_parms.persistent_fid = cfile->fid.persistent_fid;
io_parms.volatile_fid = cfile->fid.volatile_fid;
- io_parms.offset = off;
- io_parms.length = len;
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = buf;
- iov[1].iov_len = io_parms.length;
- return SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ while (len) {
+ io_parms.offset = off;
+ io_parms.length = len;
+ if (io_parms.length > SMB2_MAX_BUFFER_SIZE)
+ io_parms.length = SMB2_MAX_BUFFER_SIZE;
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = buf;
+ iov[1].iov_len = io_parms.length;
+ rc = SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ if (rc)
+ break;
+ if (nbytes > len)
+ return -EINVAL;
+ buf += nbytes;
+ off += nbytes;
+ len -= nbytes;
+ }
+ return rc;
}
static int smb3_simple_fallocate_range(unsigned int xid,
(char **)&out_data, &out_data_len);
if (rc)
goto out;
- /*
- * It is already all allocated
- */
- if (out_data_len == 0)
- goto out;
buf = kzalloc(1024 * 1024, GFP_KERNEL);
if (buf == NULL) {
goto out;
}
+ if (keep_size == true) {
+ /*
+ * We can not preallocate pages beyond the end of the file
+ * in SMB2
+ */
+ if (off >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+ /*
+ * For fallocates that are partially beyond the end of file,
+ * clamp len so we only fallocate up to the end of file.
+ */
+ if (off + len > i_size_read(inode)) {
+ len = i_size_read(inode) - off;
+ }
+ }
+
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
/*
* At this point, we are trying to fallocate an internal
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
- #ifdef CONFIG_FS_DAX
- struct rw_semaphore dax_sem;
- #endif
/*
* truncate_mutex is for serialising ext2_truncate() against
#endif
};
- #ifdef CONFIG_FS_DAX
- #define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
- #define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
- #else
- #define dax_sem_down_write(ext2_inode)
- #define dax_sem_up_write(ext2_inode)
- #endif
-
/*
* Inode dynamic state flags
*/
extern int ext2_make_empty(struct inode *, struct inode *);
extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
struct page **, void **res_page_addr);
-extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page,
+ char *kaddr);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *,
}
-#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
.iomap_begin = ext2_iomap_begin,
.iomap_end = ext2_iomap_end,
};
-#else
-/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-const struct iomap_ops ext2_iomap_ops;
-#endif /* CONFIG_FS_DAX */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext2_get_block);
+ int ret;
+
+ inode_lock(inode);
+ len = min_t(u64, len, i_size_read(inode));
+ ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
+ inode_unlock(inode);
+
+ return ret;
}
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
ext2_free_data(inode, p, q);
}
- /* dax_sem must be held when calling this function */
+ /* mapping->invalidate_lock must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
#ifdef CONFIG_FS_DAX
- WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+ WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
#endif
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (ext2_inode_is_fast_symlink(inode))
return;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
__ext2_truncate_blocks(inode, offset);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
inode->i_mtime = inode->i_ctime = current_time(inode);
if (inode_needs_sync(inode)) {
/*
* Can't do inline reclaim in fault path. We call
* dax_layout_busy_page() before we free a range. And
- * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
- * In fault path we enter with fi->i_mmap_sem held and can't drop
- * it. Also in fault path we hold fi->i_mmap_sem shared and not
- * exclusive, so that creates further issues with fuse_wait_dax_page().
- * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
- * range to become free and retry.
+ * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+ * In fault path we enter with mapping->invalidate_lock held and can't
+ * drop it. Also in fault path we hold mapping->invalidate_lock shared
+ * and not exclusive, so that creates further issues with
+ * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+ * will wait for a memory range to become free and retry.
*/
if (flags & IOMAP_FAULT) {
alloc_dmap = alloc_dax_mapping(fcd);
down_write(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
- /* We are holding either inode lock or i_mmap_sem, and that should
+ /* We are holding either inode lock or invalidate_lock, and that should
* ensure that dmap can't be truncated. We are holding a reference
* on dmap and that should make sure it can't be reclaimed. So dmap
* should still be there in tree despite the fact we dropped and
static void fuse_wait_dax_page(struct inode *inode)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
- /* Should be called with fi->i_mmap_sem lock held exclusively */
+ /* Should be called with mapping->invalidate_lock held exclusively */
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
loff_t start, loff_t end)
{
* we do not want any read/write/mmap to make progress and try
* to populate page cache or access memory we are trying to free.
*/
- down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
goto retry;
}
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
sb_end_pagefault(sb);
int ret;
struct interval_tree_node *node;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Lookup a dmap and corresponding file offset to reclaim. */
down_read(&fi->dax->sem);
out_write_dmap_sem:
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return dmap;
}
* had a reference or some other temporary failure,
* Try again. We want to give up inline reclaim only
* if there is no range assigned to this node. Otherwise
- * if a deadlock is possible if we sleep with fi->i_mmap_sem
- * held and worker to free memory can't make progress due
- * to unavailability of fi->i_mmap_sem lock. So sleep
- * only if fi->dax->nr=0
+ * if a deadlock is possible if we sleep with
+ * mapping->invalidate_lock held and worker to free memory
+ * can't make progress due to unavailability of
+ * mapping->invalidate_lock. So sleep only if fi->dax->nr=0
*/
if (retry)
continue;
* There are no mappings which can be reclaimed. Wait for one.
* We are not holding fi->dax->sem. So it is possible
* that range gets added now. But as we are not holding
- * fi->i_mmap_sem, worker should still be able to free up
- * a range and wake us up.
+ * mapping->invalidate_lock, worker should still be able to
+ * free up a range and wake us up.
*/
if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
if (wait_event_killable_exclusive(fcd->range_waitq,
/*
* Free a range of memory.
* Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
* 2. Take fi->dax->sem to protect interval tree and also to make sure
* read/write can not reuse a dmap which we might be freeing.
*/
loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
if (ret) {
pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
{
long nr_pages, nr_ranges;
- void *kaddr;
- pfn_t pfn;
struct fuse_dax_mapping *range;
int ret, id;
size_t dax_size = -1;
INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
id = dax_read_lock();
- nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
- &pfn);
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
+ NULL);
dax_read_unlock(id);
if (nr_pages < 0) {
pr_debug("dax_direct_access() returned %ld\n", nr_pages);
/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
*
* Basic locking order:
*
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
*
* mmap_lock locking order:
*
* i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
*
* The difference in mmap_lock locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
*/
XFS_IOLOCK_DEP(lock_flags));
}
- if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_ILOCK_EXCL)
mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
if (lock_flags & XFS_MMAPLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_mmaplock))
+ if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_mmaplock))
+ if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
}
out_undo_mmaplock:
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
mrunlock_excl(&ip->i_lock);
if (lock_flags & XFS_ILOCK_EXCL)
mrdemote(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrdemote(&ip->i_mmaplock);
+ downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_rwsem);
}
#if defined(DEBUG) || defined(XFS_WARN)
- int
+ static inline bool
+ __xfs_rwsem_islocked(
+ struct rw_semaphore *rwsem,
+ bool shared)
+ {
+ if (!debug_locks)
+ return rwsem_is_locked(rwsem);
+
+ if (!shared)
+ return lockdep_is_held_type(rwsem, 0);
+
+ /*
+ * We are checking that the lock is held at least in shared
+ * mode but don't care that it might be held exclusively
+ * (i.e. shared | excl). Hence we check if the lock is held
+ * in any mode rather than an explicit shared mode.
+ */
+ return lockdep_is_held_type(rwsem, -1);
+ }
+
+ bool
xfs_isilocked(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
uint lock_flags)
{
if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- if (!(lock_flags & XFS_MMAPLOCK_SHARED))
- return !!ip->i_mmaplock.mr_writer;
- return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
}
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !debug_locks ||
- lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
- return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+ if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
}
ASSERT(0);
- return 0;
+ return false;
}
#endif
}
/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders. The iolock must be double-locked separately since
- * we use i_rwsem for that. We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
*/
void
xfs_lock_two_inodes(
ASSERT(hweight32(ip1_mode) == 1);
ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+ ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
error = xfs_droplink(tp, ip);
if (error)
goto out_trans_cancel;
+
+ /*
+ * Point the unlinked child directory's ".." entry to the root
+ * directory to eliminate back-references to inodes that may
+ * get freed before the child directory is closed. If the fs
+ * gets shrunk, this can lead to dirent inode validation errors.
+ */
+ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+ return error;
+ }
} else {
/*
* When removing a non-directory we need to log the parent
ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
if (ret)
return ret;
- if (ip1 == ip2)
- xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
- ip2, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
return 0;
}
struct xfs_inode *ip1,
struct xfs_inode *ip2)
{
- bool same_inode = (ip1 == ip2);
-
- xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
inode_unlock(VFS_I(ip2));
- if (!same_inode)
+ if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
inode_dio_wait(inode);
/* Serialize against page faults */
- down_write(&zi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Serialize against zonefs_iomap_begin() */
mutex_lock(&zi->i_truncate_mutex);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
- up_write(&zi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
return ret;
}
- static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
- {
- struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
- vm_fault_t ret;
-
- down_read(&zi->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&zi->i_mmap_sem);
-
- return ret;
- }
-
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
file_update_time(vmf->vma->vm_file);
/* Serialize against truncates */
- down_read(&zi->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
- up_read(&zi->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
return ret;
}
static const struct vm_operations_struct zonefs_file_vm_ops = {
- .fault = zonefs_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = zonefs_filemap_page_mkwrite,
};
return 0;
bio = bio_alloc(GFP_NOFS, nr_pages);
- if (!bio)
- return -ENOMEM;
-
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_write_hint = iocb->ki_hint;
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
- init_rwsem(&zi->i_mmap_sem);
zi->i_wr_refcnt = 0;
return &zi->i_vnode;
* struct address_space - Contents of a cacheable, mappable object.
* @host: Owner, either the inode or the block_device.
* @i_pages: Cached pages.
+ * @invalidate_lock: Guards coherency between page cache contents and
+ * file offset->disk block mappings in the filesystem during invalidates.
+ * It is also used to block modification of page cache contents through
+ * memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
struct address_space {
struct inode *host;
struct xarray i_pages;
+ struct rw_semaphore invalidate_lock;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
down_read_nested(&inode->i_rwsem, subclass);
}
+ static inline void filemap_invalidate_lock(struct address_space *mapping)
+ {
+ down_write(&mapping->invalidate_lock);
+ }
+
+ static inline void filemap_invalidate_unlock(struct address_space *mapping)
+ {
+ up_write(&mapping->invalidate_lock);
+ }
+
+ static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+ {
+ down_read(&mapping->invalidate_lock);
+ }
+
+ static inline int filemap_invalidate_trylock_shared(
+ struct address_space *mapping)
+ {
+ return down_read_trylock(&mapping->invalidate_lock);
+ }
+
+ static inline void filemap_invalidate_unlock_shared(
+ struct address_space *mapping)
+ {
+ up_read(&mapping->invalidate_lock);
+ }
+
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);
+ void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+ void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
- /* Pending fsnotify inode refs */
- atomic_long_t s_fsnotify_inode_refs;
+ /*
+ * Number of inode/mount/sb objects that are being watched, note that
+ * inodes objects are currently double-accounted.
+ */
+ atomic_long_t s_fsnotify_connectors;
/* Being remounted read-only */
int s_readonly_remount;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
+ struct lock_class_key invalidate_lock_key;
struct lock_class_key i_mutex_dir_key;
};
switch (pages) {
case -EINTR:
return -EINTR;
- case -EFAULT: /* Incompatible mappings / permissions. */
+ case -EINVAL: /* Incompatible mappings / permissions. */
return -EINVAL;
case -EHWPOISON:
return -EHWPOISON;
+ case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
+ return -EFAULT;
default:
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/*
- * Filesystem's fallocate may need to take i_mutex. We need to
+ * Filesystem's fallocate may need to take i_rwsem. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
/*
* Truncation is a bit tricky. Enable it per file system for now.
*
- * Open: to take i_mutex or not for this? Right now we don't.
+ * Open: to take i_rwsem or not for this? Right now we don't.
*/
ret = truncate_error_page(p, pfn, mapping);
out:
* unexpected races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head))
- return 0;
+ return -EBUSY;
if (PageTransHuge(head)) {
/*
}
goto out;
} else if (ret == -EBUSY) {
- /* We raced with freeing huge page to buddy, retry. */
- if (pass++ < 3)
+ /*
+ * We raced with (possibly temporary) unhandlable
+ * page, retry.
+ */
+ if (pass++ < 3) {
+ shake_page(p, 1);
goto try_again;
+ }
+ ret = -EIO;
goto out;
}
}
/*
* Lock ordering in mm:
*
- * inode->i_mutex (while writing or truncating, not reading or faulting)
+ * inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
- * page->flags PG_locked (lock_page) * (see huegtlbfs below)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * lock_page_memcg move_lock (in __set_page_dirty_buffers)
- * i_pages lock (widely used)
- * lruvec->lru_lock (in lock_page_lruvec_irq)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * i_pages lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * mapping->invalidate_lock (in filemap_fault)
+ * page->flags PG_locked (lock_page) * (see hugetlbfs below)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * lock_page_memcg move_lock (in __set_page_dirty_buffers)
+ * i_pages lock (widely used)
+ * lruvec->lru_lock (in lock_page_lruvec_irq)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*
/*
* If the page is mlock()d, we cannot swap it out.
*/
- if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED) {
- /* PTE-mapped THP are never marked as mlocked */
- if (!PageTransCompound(page) ||
- (PageHead(page) && !PageDoubleMap(page))) {
- /*
- * Holding pte lock, we do *not* need
- * mmap_lock here
- */
- mlock_vma_page(page);
- }
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ if (!(flags & TTU_IGNORE_MLOCK) &&
+ (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * PTE-mapped THP are never marked as mlocked: so do
+ * not set it on a DoubleMap THP, nor on an Anon THP
+ * (which may still be PTE-mapped after DoubleMap was
+ * cleared). But stop unmapping even in those cases.
+ */
+ if (!PageTransCompound(page) || (PageHead(page) &&
+ !PageDoubleMap(page) && !PageAnon(page)))
+ mlock_vma_page(page);
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
}
/* Unexpected PMD-mapped THP? */
*/
if (vma->vm_flags & VM_LOCKED) {
/*
- * PTE-mapped THP are never marked as mlocked, but
- * this function is never called when PageDoubleMap().
+ * PTE-mapped THP are never marked as mlocked; but
+ * this function is never called on a DoubleMap THP,
+ * nor on an Anon THP (which may still be PTE-mapped
+ * after DoubleMap was cleared).
*/
mlock_vma_page(page);
/*
VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ /* Anon THP are only marked as mlocked when singly mapped */
+ if (PageTransCompound(page) && PageAnon(page))
+ return;
+
rmap_walk(page, &rwc);
}
/*
* shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
+ * inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
- struct swap_info_struct *si;
- struct page *page = NULL;
+ struct page *page;
swp_entry_t swap;
int error;
swap = radix_to_swp_entry(*pagep);
*pagep = NULL;
- /* Prevent swapoff from happening to us. */
- si = get_swap_device(swap);
- if (!si) {
- error = EINVAL;
- goto failed;
- }
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
swap_free(swap);
*pagep = page;
- if (si)
- put_swap_device(si);
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
put_page(page);
}
- if (si)
- put_swap_device(si);
-
return error;
}
/*
* Trinity finds that probing a hole which tmpfs is punching can
* prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_mutex. So refrain from
+ * locks writers out with its hold on i_rwsem. So refrain from
* faulting pages into the hole while it's being punched. Although
* shmem_undo_range() does remove the additions, it may be unable to
* keep up, as each new page needs its own unmap_mapping_range() call,
* we just need to make racing faults a rare case.
*
* The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_mutex in fault,
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
* and bloating every shmem inode for this unlikely case would be sad.
*/
if (unlikely(inode->i_private)) {
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
- /* i_mutex is held by caller */
+ /* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
/*
* We must evaluate after, since reads (unlike writes)
- * are called without i_mutex protection against truncate
+ * are called without i_rwsem protection against truncate
*/
nr = PAGE_SIZE;
i_size = i_size_read(inode);
return -ENXIO;
inode_lock(inode);
- /* We're holding i_mutex so we can access i_size directly */
+ /* We're holding i_rwsem so we can access i_size directly */
offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;