mm/gup: introduce memfd_pin_folios() for pinning memfd folios

author Vivek Kasireddy <vivek.kasireddy@intel.com>

Mon, 24 Jun 2024 06:36:11 +0000 (23:36 -0700)

committer Andrew Morton <akpm@linux-foundation.org>

Fri, 12 Jul 2024 22:52:09 +0000 (15:52 -0700)
author Vivek Kasireddy <vivek.kasireddy@intel.com>
Mon, 24 Jun 2024 06:36:11 +0000 (23:36 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 12 Jul 2024 22:52:09 +0000 (15:52 -0700)
diff --git a/include/linux/memfd.h b/include/linux/memfd.h

index e7abf6f..3f2cf33 100644 (file)
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -6,11 +6,16 @@
  
  #ifdef CONFIG_MEMFD_CREATE
  extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
+struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
  #else
  static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
  {
         return -EINVAL;
  }
+/*
+ * Stub used when CONFIG_MEMFD_CREATE is not set: always returns
+ * ERR_PTR(-EINVAL), so callers must check the result with IS_ERR().
+ */
+static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
+{
+       return ERR_PTR(-EINVAL);
+}
  #endif
  
  #endif /* __LINUX_MEMFD_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 7b84379..5f1075d 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2500,6 +2500,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                     struct page **pages, unsigned int gup_flags);
  long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                     struct page **pages, unsigned int gup_flags);
+long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
+                     struct folio **folios, unsigned int max_folios,
+                     pgoff_t *offset);
  
  int get_user_pages_fast(unsigned long start, int nr_pages,
                         unsigned int gup_flags, struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c

index d98bb19..85d45ec 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -5,6 +5,7 @@
  #include <linux/spinlock.h>
  
  #include <linux/mm.h>
+#include <linux/memfd.h>
  #include <linux/memremap.h>
  #include <linux/pagemap.h>
  #include <linux/rmap.h>
@@ -17,6 +18,7 @@
  #include <linux/hugetlb.h>
  #include <linux/migrate.h>
  #include <linux/mm_inline.h>
+#include <linux/pagevec.h>
  #include <linux/sched/mm.h>
  #include <linux/shmem_fs.h>
  
@@ -3764,3 +3766,140 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                                      &locked, gup_flags);
  }
  EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/**
+ * memfd_pin_folios() - pin folios associated with a memfd
+ * @memfd:      the memfd whose folios are to be pinned
+ * @start:      the first memfd offset
+ * @end:        the last memfd offset (inclusive)
+ * @folios:     array that receives pointers to the folios pinned
+ * @max_folios: maximum number of entries in @folios
+ * @offset:     the offset into the first folio
+ *
+ * Attempt to pin folios associated with a memfd in the contiguous range
+ * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
+ * the folios can either be found in the page cache or need to be allocated
+ * if necessary. Once the folios are located, they are all pinned via
+ * FOLL_PIN and @offset is populated with the offset into the first folio.
+ * And, eventually, these pinned folios must be released either using
+ * unpin_folios() or unpin_folio().
+ *
+ * It must be noted that the folios may be pinned for an indefinite amount
+ * of time. And, in most cases, the duration of time they may stay pinned
+ * would be controlled by the userspace. This behavior is effectively the
+ * same as using FOLL_LONGTERM with other GUP APIs.
+ *
+ * Returns number of folios pinned, which could be less than @max_folios
+ * as it depends on the folio sizes that cover the range [start, end].
+ * If no folios were pinned, it returns -errno.
+ *
+ * -EINVAL is returned without pinning anything if @start is negative or
+ * greater than @end, if @max_folios is zero, if @memfd is NULL or is
+ * neither a shmem nor a hugetlb file, or if @end is at or beyond the
+ * current file size.
+ */
+long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
+                     struct folio **folios, unsigned int max_folios,
+                     pgoff_t *offset)
+{
+       unsigned int flags, nr_folios, nr_found;
+       unsigned int i, pgshift = PAGE_SHIFT;
+       pgoff_t start_idx, end_idx, next_idx;
+       struct folio *folio = NULL;
+       struct folio_batch fbatch;
+       struct hstate *h;
+       long ret = -EINVAL;
+
+       if (start < 0 || start > end || !max_folios)
+               return -EINVAL;
+
+       if (!memfd)
+               return -EINVAL;
+
+       if (!shmem_file(memfd) && !is_file_hugepages(memfd))
+               return -EINVAL;
+
+       if (end >= i_size_read(file_inode(memfd)))
+               return -EINVAL;
+
+       /* h is only set, and only used, for hugetlb-backed files. */
+       if (is_file_hugepages(memfd)) {
+               h = hstate_file(memfd);
+               pgshift = huge_page_shift(h);
+       }
+
+       flags = memalloc_pin_save();
+       do {
+               /*
+                * Every pass starts over from @start: the loop is re-entered
+                * only when check_and_migrate_movable_folios() below returns
+                * -EAGAIN.
+                */
+               nr_folios = 0;
+               start_idx = start >> pgshift;
+               end_idx = end >> pgshift;
+               if (is_file_hugepages(memfd)) {
+                       /* Convert huge page indices back to base page units. */
+                       start_idx <<= huge_page_order(h);
+                       end_idx <<= huge_page_order(h);
+               }
+
+               folio_batch_init(&fbatch);
+               while (start_idx <= end_idx && nr_folios < max_folios) {
+                       /*
+                        * In most cases, we should be able to find the folios
+                        * in the page cache. If we cannot find them for some
+                        * reason, we try to allocate them and add them to the
+                        * page cache.
+                        */
+                       nr_found = filemap_get_folios_contig(memfd->f_mapping,
+                                                            &start_idx,
+                                                            end_idx,
+                                                            &fbatch);
+                       /*
+                        * A non-NULL folio here can only come from
+                        * memfd_alloc_folio() in the previous pass; put it
+                        * now that the lookup above has been retried.
+                        */
+                       if (folio) {
+                               folio_put(folio);
+                               folio = NULL;
+                       }
+
+                       next_idx = 0;
+                       for (i = 0; i < nr_found; i++) {
+                               /*
+                                * As there can be multiple entries for a
+                                * given folio in the batch returned by
+                                * filemap_get_folios_contig(), the below
+                                * check is to ensure that we pin and return a
+                                * unique set of folios between start and end.
+                                */
+                               if (next_idx &&
+                                   next_idx != folio_index(fbatch.folios[i]))
+                                       continue;
+
+                               folio = page_folio(&fbatch.folios[i]->page);
+
+                               if (try_grab_folio(folio, 1, FOLL_PIN)) {
+                                       folio_batch_release(&fbatch);
+                                       ret = -EINVAL;
+                                       goto err;
+                               }
+
+                               if (nr_folios == 0)
+                                       *offset = offset_in_folio(folio, start);
+
+                               folios[nr_folios] = folio;
+                               next_idx = folio_next_index(folio);
+                               if (++nr_folios == max_folios)
+                                       break;
+                       }
+
+                       folio = NULL;
+                       folio_batch_release(&fbatch);
+                       if (!nr_found) {
+                               folio = memfd_alloc_folio(memfd, start_idx);
+                               if (IS_ERR(folio)) {
+                                       ret = PTR_ERR(folio);
+                                       if (ret != -EEXIST)
+                                               goto err;
+                                       /*
+                                        * -EEXIST is tolerated and the lookup
+                                        * is simply retried. Clear the error
+                                        * pointer first, otherwise the next
+                                        * pass would hand it to folio_put().
+                                        */
+                                       folio = NULL;
+                               }
+                       }
+               }
+
+               ret = check_and_migrate_movable_folios(nr_folios, folios);
+       } while (ret == -EAGAIN);
+
+       memalloc_pin_restore(flags);
+       return ret ? ret : nr_folios;
+err:
+       /* Release whatever was pinned so far in this pass. */
+       memalloc_pin_restore(flags);
+       unpin_folios(folios, nr_folios);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(memfd_pin_folios);
diff --git a/mm/memfd.c b/mm/memfd.c

index 7d8d3ab..e7b7c52 100644 (file)
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -59,6 +59,51 @@ static void memfd_tag_pins(struct xa_state *xas)
         xas_unlock_irq(xas);
  }
  
+/*
+ * This is a helper function used by memfd_pin_folios() in GUP (gup.c).
+ * It is mainly called to allocate a folio in a memfd when the caller
+ * (memfd_pin_folios()) cannot find a folio in the page cache at a given
+ * index in the mapping.
+ *
+ * @idx is expressed in base (PAGE_SIZE) page units for both shmem and
+ * hugetlb files, which is what memfd_pin_folios() passes in.
+ *
+ * Returns the folio on success or an ERR_PTR() on failure; for hugetlb
+ * files the failure is the error from hugetlb_add_to_page_cache(), or
+ * -ENOMEM if no folio could be obtained.
+ */
+struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       struct folio *folio;
+       gfp_t gfp_mask;
+       int err;
+
+       if (is_file_hugepages(memfd)) {
+               struct hstate *h = hstate_file(memfd);
+
+               /*
+                * The folio would most likely be accessed by a DMA driver,
+                * therefore, we have zone memory constraints where we can
+                * alloc from. Also, the folio will be pinned for an indefinite
+                * amount of time, so it is not expected to be migrated away.
+                */
+               gfp_mask = htlb_alloc_mask(h);
+               gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
+
+               /*
+                * hugetlb_add_to_page_cache() takes the index in huge page
+                * units and scales it by the huge page order itself, so
+                * convert the base page index back. Otherwise the folio is
+                * inserted at the wrong index and the caller's page cache
+                * lookup never finds it.
+                */
+               idx >>= huge_page_order(h);
+
+               folio = alloc_hugetlb_folio_nodemask(h,
+                                                    numa_node_id(),
+                                                    NULL,
+                                                    gfp_mask,
+                                                    false);
+               if (folio && folio_try_get(folio)) {
+                       err = hugetlb_add_to_page_cache(folio,
+                                                       memfd->f_mapping,
+                                                       idx);
+                       if (err) {
+                               folio_put(folio);
+                               free_huge_folio(folio);
+                               return ERR_PTR(err);
+                       }
+                       /*
+                        * hugetlb_add_to_page_cache() returns with the folio
+                        * locked; unlock it so later users of the folio do
+                        * not block on the lock forever.
+                        */
+                       folio_unlock(folio);
+                       return folio;
+               }
+               return ERR_PTR(-ENOMEM);
+       }
+#endif
+       return shmem_read_folio(memfd->f_mapping, idx);
+}
+
  /*
   * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
   * via get_user_pages(), drivers might have some pending I/O without any active
author	Vivek Kasireddy <vivek.kasireddy@intel.com>
	Mon, 24 Jun 2024 06:36:11 +0000 (23:36 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
	Fri, 12 Jul 2024 22:52:09 +0000 (15:52 -0700)
include/linux/memfd.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
mm/gup.c		patch \| blob \| history
mm/memfd.c		patch \| blob \| history