unsigned int page_mask;
};
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
/**
* put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
spinlock_t *ptl;
pte_t *ptep, pte;
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
*
* Must be called with mmap_sem held. It may be released. See below.
*
start = untagged_addr(start);
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
BUG_ON(*locked != 1);
}
- if (pages)
+ /*
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
+ * for FOLL_GET, not for the newer FOLL_PIN.
+ *
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
+ * that here, as any failures will be obvious enough.
+ */
+ if (pages && !(flags & FOLL_PIN))
flags |= FOLL_GET;
pages_done = 0;
return pages_done;
}
-/*
- * get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
- * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
- * be called after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- *
- * get_user_pages should be phased out in favor of
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
- */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
-}
-EXPORT_SYMBOL(get_user_pages_remote);
-
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
bool drain_allow = true;
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
+ long ret = nr_pages;
check_again:
for (i = 0; i < nr_pages;) {
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
- if ((nr_pages > 0) && migrate_allow) {
+ if ((ret > 0) && migrate_allow) {
+ nr_pages = ret;
drain_allow = true;
goto check_again;
}
}
- return nr_pages;
+ return ret;
}
#else
static long check_and_migrate_cma_pages(struct task_struct *tsk,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+/*
+ * get_user_pages_remote() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
+ * be called after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
+ */
+#ifdef CONFIG_MMU
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ /*
+ * Parts of FOLL_LONGTERM behavior are incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. However, this only comes up if locked is set, and there are
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
+ * allow what we can.
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ if (WARN_ON_ONCE(locked))
+ return -EINVAL;
+ /*
+ * This will check the vmas (even if our vmas arg is NULL)
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
+ */
+ return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ vmas, gup_flags | FOLL_TOUCH |
+ FOLL_REMOTE);
+ }
+
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+#else /* CONFIG_MMU */
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
+#endif /* !CONFIG_MMU */
+
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
return __gup_longterm_locked(current, current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
}
}
-/*
- * Return the compund head page with ref appropriately incremented,
- * or NULL if that failed.
- */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
-{
- struct page *head = compound_head(page);
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
- return NULL;
- if (unlikely(!page_cache_add_speculative(head, refs)))
- return NULL;
- return head;
-}
-
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
}
#endif
+static int record_subpages(struct page *page, unsigned long addr,
+ unsigned long end, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
+ pages[nr++] = page++;
+
+ return nr;
+}
+
+static void put_compound_head(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- refs = 0;
head = pte_page(pte);
-
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(head, refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
- refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pmd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
- refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pud_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- refs = 0;
+
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pgd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
- if (pud_none(pud))
+ if (unlikely(!pud_present(pud)))
return 0;
if (unlikely(pud_huge(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
return ret;
}
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
+static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags,
+ struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
- if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
return ret;
}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_fast().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+
+/**
+ * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_remote().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
+ vmas, locked);
+}
+EXPORT_SYMBOL(pin_user_pages_remote);
+
+/**
+ * pin_user_pages() - pin user pages in memory for use by other devices
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+EXPORT_SYMBOL(pin_user_pages);