hugetlbfs: add hugetlbfs_fallocate()

author Mike Kravetz <mike.kravetz@oracle.com>

Tue, 8 Sep 2015 22:01:54 +0000 (15:01 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 8 Sep 2015 22:35:28 +0000 (15:35 -0700)
author Mike Kravetz <mike.kravetz@oracle.com>
Tue, 8 Sep 2015 22:01:54 +0000 (15:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Sep 2015 22:35:28 +0000 (15:35 -0700)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index 1ef630f..316adb9 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
  #include <linux/thread_info.h>
  #include <asm/current.h>
  #include <linux/sched.h>               /* remove ASAP */
+#include <linux/falloc.h>
  #include <linux/fs.h>
  #include <linux/mount.h>
  #include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
         {Opt_err,       NULL},
  };
  
+#ifdef CONFIG_NUMA /* helpers to set/drop vm_policy on a caller-supplied vma */
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{      /* store the inode's shared-policy lookup result for index in the vma */
+       vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+                                                       index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{      /* hand the policy set by hugetlb_set_vma_policy() to mpol_cond_put() */
+       mpol_cond_put(vma->vm_policy);
+}
+#else /* !CONFIG_NUMA: both helpers are empty stubs */
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_NUMA */
+
  static void huge_pagevec_release(struct pagevec *pvec)
  {
         int i;
@@ -479,6 +503,158 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
         return 0;
  }
  
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{      /* FALLOC_FL_PUNCH_HOLE helper, called from hugetlbfs_fallocate() */
+       struct hstate *h = hstate_inode(inode);
+       loff_t hpage_size = huge_page_size(h);
+       loff_t hole_start, hole_end;
+
+       /*
+        * For hole punch round up the beginning offset of the hole and round
+        * down the end; only this inward-rounded byte range is acted on below.
+        */
+       hole_start = round_up(offset, hpage_size);
+       hole_end = round_down(offset + len, hpage_size);
+
+       if (hole_end > hole_start) {    /* rounded range is non-empty */
+               struct address_space *mapping = inode->i_mapping;
+
+               mutex_lock(&inode->i_mutex);    /* held across both steps below */
+               i_mmap_lock_write(mapping);
+               if (!RB_EMPTY_ROOT(&mapping->i_mmap))   /* inode has mappings */
+                       hugetlb_vmdelete_list(&mapping->i_mmap,
+                                               hole_start >> PAGE_SHIFT,
+                                               hole_end  >> PAGE_SHIFT);
+               i_mmap_unlock_write(mapping);
+               remove_inode_hugepages(inode, hole_start, hole_end);
+               mutex_unlock(&inode->i_mutex);
+       }
+
+       return 0;       /* always 0, including when the rounded range is empty */
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+                               loff_t len)
+{      /* fallocate for hugetlbfs: preallocate huge pages, or punch a hole */
+       struct inode *inode = file_inode(file);
+       struct address_space *mapping = inode->i_mapping;
+       struct hstate *h = hstate_inode(inode);
+       struct vm_area_struct pseudo_vma;
+       struct mm_struct *mm = current->mm;
+       loff_t hpage_size = huge_page_size(h);
+       unsigned long hpage_shift = huge_page_shift(h);
+       pgoff_t start, index, end;
+       int error;
+       u32 hash;
+
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;     /* any other mode bit is rejected */
+
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return hugetlbfs_punch_hole(inode, offset, len);
+
+       /*
+        * Default preallocate case.
+        * For this range, start is rounded down and end is rounded up
+        * as well as being converted to page offsets.
+        */
+       start = offset >> hpage_shift;
+       end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+       mutex_lock(&inode->i_mutex);    /* released at the out: label */
+
+       /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+       error = inode_newsize_ok(inode, offset + len);
+       if (error)
+               goto out;
+
+       /*
+        * Initialize a pseudo vma as this is required by the huge page
+        * allocation routines.  If NUMA is configured, use page index
+        * as input to create an allocation policy.
+        */
+       memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+       pseudo_vma.vm_file = file;
+
+       for (index = start; index < end; index++) {
+               struct page *page;
+               /*
+                * This is supposed to be the vaddr where the page is being
+                * faulted in, but we have no vaddr here.
+                */
+               unsigned long addr;
+               int avoid_reserve = 0;
+
+               cond_resched();
+
+               /*
+                * fallocate(2) manpage permits EINTR; we may have been
+                * interrupted because we are using up too much memory.
+                */
+               if (signal_pending(current)) {
+                       error = -EINTR;
+                       break;  /* falls through to the size/ctime update */
+               }
+
+               /* Set numa allocation policy based on index */
+               hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+               /* addr is the offset within the file (zero based) */
+               addr = index * hpage_size;
+
+               /* mutex taken here, fault path and hole punch */
+               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+                                               index, addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+               /* See if already present in mapping to avoid alloc/free */
+               page = find_get_page(mapping, index);
+               if (page) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_drop_vma_policy(&pseudo_vma);
+                       continue;
+               }
+
+               /* Allocate page and add to page cache */
+               page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+               hugetlb_drop_vma_policy(&pseudo_vma);
+               if (IS_ERR(page)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       error = PTR_ERR(page);
+                       goto out;
+               }
+               clear_huge_page(page, addr, pages_per_huge_page(h));
+               __SetPageUptodate(page);
+               error = huge_add_to_page_cache(page, mapping, index);
+               if (unlikely(error)) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out;
+               }
+
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+               /*
+                * unlock_page (locked by add_to_page_cache()) before put_page
+                * (alloc_huge_page() ref): don't touch page after dropping it
+                */
+               unlock_page(page);
+               put_page(page);
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+               i_size_write(inode, offset + len);
+       inode->i_ctime = CURRENT_TIME;
+       spin_lock(&inode->i_lock);
+       inode->i_private = NULL;
+       spin_unlock(&inode->i_lock);
+out:
+       mutex_unlock(&inode->i_mutex);
+       return error;
+}
+
  static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
  {
         struct inode *inode = d_inode(dentry);
@@ -790,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
         .mmap                   = hugetlbfs_file_mmap,
         .fsync                  = noop_fsync,
         .get_unmapped_area      = hugetlb_get_unmapped_area,
-       .llseek         = default_llseek,
+       .llseek                 = default_llseek,
+       .fallocate              = hugetlbfs_fallocate,
  };
  
  static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 1222fb0..5e35379 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -330,6 +330,8 @@ struct huge_bootmem_page {
  #endif
  };
  
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+                               unsigned long addr, int avoid_reserve);
  struct page *alloc_huge_page_node(struct hstate *h, int nid);
  struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
                                 unsigned long addr, int avoid_reserve);
@@ -483,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
  
  #else  /* CONFIG_HUGETLB_PAGE */
  struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
  #define alloc_huge_page_node(h, nid) NULL
  #define alloc_huge_page_noerr(v, a, r) NULL
  #define alloc_bootmem_huge_page(h) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index d45eacc..cd1280c 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1727,7 +1727,7 @@ static void vma_end_reservation(struct hstate *h,
         (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
  }
  
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr, int avoid_reserve)
  {
         struct hugepage_subpool *spool = subpool_vma(vma);
author	Mike Kravetz <mike.kravetz@oracle.com>
	Tue, 8 Sep 2015 22:01:54 +0000 (15:01 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 8 Sep 2015 22:35:28 +0000 (15:35 -0700)
fs/hugetlbfs/inode.c		patch \| blob \| history
include/linux/hugetlb.h		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history