#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
* a subpool minimum size must be maintained.
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
if (!PageHWPoison(page))
break;
+ }
+
/*
* if 'non-isolated free hugepage' not found on the list,
* the allocation fails.
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
int nid, nodemask_t *nodemask)
{
unsigned long nr_pages = 1UL << huge_page_order(h);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
#ifdef CONFIG_CMA
{
struct page *page;
int node;
- for_each_node_mask(node, *nodemask) {
- if (!hugetlb_cma[node])
- continue;
-
- page = cma_alloc(hugetlb_cma[node], nr_pages,
- huge_page_order(h), true);
+ if (hugetlb_cma[nid]) {
+ page = cma_alloc(hugetlb_cma[nid], nr_pages,
+ huge_page_order(h), true);
if (page)
return page;
}
+
+ if (!(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ page = cma_alloc(hugetlb_cma[node], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+ }
}
#endif
return page;
}
-struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
return page;
}
-/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the freed pages across the
+ * free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
return nr;
}
#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * can duplicate the @table and alter the duplicate of it.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
if (!hugepages_supported())
return -EOPNOTSUPP;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
if (write && hstate_is_gigantic(h))
return -EINVAL;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
+ *
+ * Apart from cpuset, we also have memory policy mechanism that
+ * also determines from which node the kernel will allocate memory
+ * in a NUMA system. So similar to cpuset, we also should consider
+ * the memory policy of the current task. Similar to the description
+ * above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
/*
* We just unmapped a page of PMDs by clearing a PUD.
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
/*
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
return 0;
}
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
+ char name[20];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
+ snprintf(name, 20, "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
- 0, false, "hugetlb",
+ 0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",