#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
- * for all the existings adds_in_progress. We should only be
+ * for all the existing adds_in_progress. We should only be
* needing to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
- bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+ bool pin = !!(current->flags & PF_MEMALLOC_PIN);
lockdep_assert_held(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (nocma && is_migrate_cma_page(page))
+ if (pin && !is_pinnable_page(page))
continue;
if (PageHWPoison(page))
}
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+/*
+ * Must be called with the hugetlb lock held
+ */
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
+{
+ lockdep_assert_held(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+}
+
+static void __prep_new_huge_page(struct page *page)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ __prep_new_huge_page(page);
spin_lock_irq(&hugetlb_lock);
- h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
+ __prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
}
}
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+ struct list_head *list)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = page_to_nid(old_page);
+ struct page *new_page;
+ int ret = 0;
+
+ /*
+ * Before dissolving the page, we need to allocate a new one for the
+ * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+ * not having to deal with prep_new_huge_page() and avoids dealing of any
+ * counters. This simplifies and let us do the whole thing under the
+ * lock.
+ */
+ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+ if (!new_page)
+ return -ENOMEM;
+
+retry:
+ spin_lock_irq(&hugetlb_lock);
+ if (!PageHuge(old_page)) {
+ /*
+ * Freed from under us. Drop new_page too.
+ */
+ goto free_new;
+ } else if (page_count(old_page)) {
+ /*
+ * Someone has grabbed the page, try to isolate it here.
+ * Fail with -EBUSY if not possible.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ if (!isolate_huge_page(old_page, list))
+ ret = -EBUSY;
+ spin_lock_irq(&hugetlb_lock);
+ goto free_new;
+ } else if (!HPageFreed(old_page)) {
+ /*
+ * Page's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_page is still a genuine free hugepage. Remove it from
+ * the freelist and decrease the counters. These will be
+ * incremented again when calling __prep_account_new_huge_page()
+ * and enqueue_huge_page() for new_page. The counters will remain
+ * stable since this happens under the lock.
+ */
+ remove_hugetlb_page(h, old_page, false);
+
+ /*
+ * new_page needs to be initialized with the standard hugetlb
+ * state. This is normally done by prep_new_huge_page() but
+ * that takes hugetlb_lock which is already held so we need to
+ * open code it here.
+ * Reference count trick is needed because allocator gives us
+ * referenced page but the pool requires pages with 0 refcount.
+ */
+ __prep_new_huge_page(new_page);
+ __prep_account_new_huge_page(h, nid);
+ page_ref_dec(new_page);
+ enqueue_huge_page(h, new_page);
+
+ /*
+ * Pages have been replaced, we can safely free the old one.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_page(h, old_page);
+ }
+
+ return ret;
+
+free_new:
+ spin_unlock_irq(&hugetlb_lock);
+ __free_pages(new_page, huge_page_order(h));
+
+ return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+ struct hstate *h;
+ struct page *head;
+ int ret = -EBUSY;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHuge(page)) {
+ head = compound_head(page);
+ h = page_hstate(head);
+ } else {
+ spin_unlock_irq(&hugetlb_lock);
+ return 0;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOMEM as this has the effect
+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
+ if (page_count(head) && isolate_huge_page(head, list))
+ ret = 0;
+ else if (!page_count(head))
+ ret = alloc_and_dissolve_huge_page(h, head, list);
+
+ return ret;
+}
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_pte_wrprotect(entry);
}
page_dup_rmap(ptepage, true);
return 0;
}
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx,
+ unsigned int flags,
+ unsigned long haddr,
+ unsigned long reason)
+{
+ vm_fault_t ret;
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .flags = flags,
+
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ ret = handle_userfault(&vmf, reason);
+ i_mmap_lock_read(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ return ret;
+}
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- /*
- * Check for page in userfault range
- */
+ /* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- u32 hash;
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .flags = flags,
- /*
- * Hard to debug if it ends up being
- * used by a callee that assumes
- * something about the other
- * uninitialized fields... same as in
- * memory.c
- */
- };
-
- /*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MISSING);
goto out;
}
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
+
+ /* Check for page in userfault range. */
+ if (userfaultfd_minor(vma)) {
+ unlock_page(page);
+ put_page(page);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MINOR);
+ goto out;
+ }
}
/*
return ret;
}
+#ifdef CONFIG_USERFAULTFD
/*
* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
* modifications for huge pages.
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
struct page **pagep)
{
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
unsigned long size;
spinlock_t *ptl;
int ret;
struct page *page;
+ int writable;
- if (!*pagep) {
+ mapping = dst_vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ if (is_continue) {
+ ret = -EFAULT;
+ page = find_lock_page(mapping, idx);
+ if (!page)
+ goto out;
+ } else if (!*pagep) {
ret = -ENOMEM;
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page))
*/
__SetPageUptodate(page);
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
- /*
- * If shared, add to page cache
- */
- if (vm_shared) {
+ /* Add shared, newly allocated pages to the page cache. */
+ if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
- if (dst_vma->vm_flags & VM_WRITE)
+ /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+ if (is_continue && !vm_shared)
+ writable = 0;
+ else
+ writable = dst_vma->vm_flags & VM_WRITE;
+
+ _dst_pte = make_huge_pte(dst_vma, page, writable);
+ if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- SetHPageMigratable(page);
- if (vm_shared)
+ if (!is_continue)
+ SetHPageMigratable(page);
+ if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
- if (vm_shared)
+ if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
put_page(page);
goto out;
}
+#endif /* CONFIG_USERFAULTFD */
static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
int refs, struct page **pages,
v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
/*
- * vma need span at least one aligned PUD size and the start,end range
- * must at least partialy within it.
+ * vma needs to span at least one aligned PUD size, and the range
+ * must be at least partially within in.
*/
if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
(*end <= v_start) || (*start >= v_end))