mm/hugetlb: document the reserve map/region tracking routines

[linux-2.6-microblaze.git] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index c41b2a0..4a1d702 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
  int hugetlb_max_hstate __read_mostly;
  unsigned int default_hstate_idx;
  struct hstate hstates[HUGE_MAX_HSTATE];
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
  
  __initdata LIST_HEAD(huge_boot_pages);
  
@@ -61,6 +66,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
  static int num_fault_mutexes;
  static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
  
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
  static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
  {
         bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +76,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
         spin_unlock(&spool->lock);
  
         /* If no pages are used, and no other handles to the subpool
-        * remain, free the subpool the subpool remain */
-       if (free)
+        * remain, give up any reservations based on minimum size and
+        * free the subpool */
+       if (free) {
+               if (spool->min_hpages != -1)
+                       hugetlb_acct_memory(spool->hstate,
+                                               -spool->min_hpages);
                 kfree(spool);
+       }
  }
  
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+                                               long min_hpages)
  {
         struct hugepage_subpool *spool;
  
-       spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+       spool = kzalloc(sizeof(*spool), GFP_KERNEL);
         if (!spool)
                 return NULL;
  
         spin_lock_init(&spool->lock);
         spool->count = 1;
-       spool->max_hpages = nr_blocks;
-       spool->used_hpages = 0;
+       spool->max_hpages = max_hpages;
+       spool->hstate = h;
+       spool->min_hpages = min_hpages;
+
+       if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+               kfree(spool);
+               return NULL;
+       }
+       spool->rsv_hpages = min_hpages;
  
         return spool;
  }
@@ -97,36 +118,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
         unlock_or_release_subpool(spool);
  }
  
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * request.  Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward).  The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                       long delta)
  {
-       int ret = 0;
+       long ret = delta;
  
         if (!spool)
-               return 0;
+               return ret;
  
         spin_lock(&spool->lock);
-       if ((spool->used_hpages + delta) <= spool->max_hpages) {
-               spool->used_hpages += delta;
-       } else {
-               ret = -ENOMEM;
+
+       if (spool->max_hpages != -1) {          /* maximum size accounting */
+               if ((spool->used_hpages + delta) <= spool->max_hpages)
+                       spool->used_hpages += delta;
+               else {
+                       ret = -ENOMEM;
+                       goto unlock_ret;
+               }
+       }
+
+       if (spool->min_hpages != -1) {          /* minimum size accounting */
+               if (delta > spool->rsv_hpages) {
+                       /*
+                        * Asking for more reserves than those already taken on
+                        * behalf of subpool.  Return difference.
+                        */
+                       ret = delta - spool->rsv_hpages;
+                       spool->rsv_hpages = 0;
+               } else {
+                       ret = 0;        /* reserves already accounted for */
+                       spool->rsv_hpages -= delta;
+               }
         }
-       spin_unlock(&spool->lock);
  
+unlock_ret:
+       spin_unlock(&spool->lock);
         return ret;
  }
  
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                        long delta)
  {
+       long ret = delta;
+
         if (!spool)
-               return;
+               return delta;
  
         spin_lock(&spool->lock);
-       spool->used_hpages -= delta;
-       /* If hugetlbfs_put_super couldn't free spool due to
-       * an outstanding quota reference, free it now. */
+
+       if (spool->max_hpages != -1)            /* maximum size accounting */
+               spool->used_hpages -= delta;
+
+       if (spool->min_hpages != -1) {          /* minimum size accounting */
+               if (spool->rsv_hpages + delta <= spool->min_hpages)
+                       ret = 0;
+               else
+                       ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+               spool->rsv_hpages += delta;
+               if (spool->rsv_hpages > spool->min_hpages)
+                       spool->rsv_hpages = spool->min_hpages;
+       }
+
+       /*
+        * If hugetlbfs_put_super couldn't free spool due to an outstanding
+        * quota reference, free it now.
+        */
         unlock_or_release_subpool(spool);
+
+       return ret;
  }
  
  static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -143,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
   * Region tracking -- allows tracking of reservations and instantiated pages
   *                    across the pages in a mapping.
   *
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock.  The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map.  The from and to elements are huge page
+ * indices into the associated mapping.  from indicates the starting index
+ * of the region.  to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping.  It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
   */
  struct file_region {
         struct list_head link;
@@ -152,6 +238,14 @@ struct file_region {
         long to;
  };
  
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map.  Existing regions will be expanded to accommodate the
+ * specified range.  We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range.  If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ */
  static long region_add(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
@@ -191,6 +285,25 @@ static long region_add(struct resv_map *resv, long f, long t)
         return 0;
  }
  
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented.  This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t).  region_chg does
+ * not change the number of huge pages represented by the
+ * map.  However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder.  This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater than or equal to zero.  -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
  static long region_chg(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
@@ -257,6 +370,11 @@ out_nrg:
         return chg;
  }
  
+/*
+ * Truncate the reserve map at index 'end'.  Modify/truncate any
+ * region which contains end.  Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
  static long region_truncate(struct resv_map *resv, long end)
  {
         struct list_head *head = &resv->regions;
@@ -292,6 +410,10 @@ out:
         return chg;
  }
  
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
  static long region_count(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
@@ -855,6 +977,31 @@ struct hstate *size_to_hstate(unsigned long size)
         return NULL;
  }
  
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHuge(page), page);
+       return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+       SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+       ClearPagePrivate(&page[1]);
+}
+
  void free_huge_page(struct page *page)
  {
         /*
@@ -874,7 +1021,16 @@ void free_huge_page(struct page *page)
         restore_reserve = PagePrivate(page);
         ClearPagePrivate(page);
  
+       /*
+        * A return code of zero implies that the subpool will be under its
+        * minimum size if the reservation is not restored after the page is freed.
+        * Therefore, force restore_reserve operation.
+        */
+       if (hugepage_subpool_put_pages(spool, 1) == 0)
+               restore_reserve = true;
+
         spin_lock(&hugetlb_lock);
+       clear_page_huge_active(page);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
         if (restore_reserve)
@@ -891,7 +1047,6 @@ void free_huge_page(struct page *page)
                 enqueue_huge_page(h, page);
         }
         spin_unlock(&hugetlb_lock);
-       hugepage_subpool_put_pages(spool, 1);
  }
  
  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1086,19 +1241,13 @@ static void dissolve_free_huge_page(struct page *page)
   */
  void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
  {
-       unsigned int order = 8 * sizeof(void *);
         unsigned long pfn;
-       struct hstate *h;
  
         if (!hugepages_supported())
                 return;
  
-       /* Set scan step to minimum hugepage size */
-       for_each_hstate(h)
-               if (order > huge_page_order(h))
-                       order = huge_page_order(h);
-       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
                 dissolve_free_huge_page(pfn_to_page(pfn));
  }
  
@@ -1386,7 +1535,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (chg < 0)
                 return ERR_PTR(-ENOMEM);
         if (chg || avoid_reserve)
-               if (hugepage_subpool_get_pages(spool, 1))
+               if (hugepage_subpool_get_pages(spool, 1) < 0)
                         return ERR_PTR(-ENOSPC);
  
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -1525,10 +1674,14 @@ static void __init hugetlb_init_hstates(void)
         struct hstate *h;
  
         for_each_hstate(h) {
+               if (minimum_order > huge_page_order(h))
+                       minimum_order = huge_page_order(h);
+
                 /* oversize hugepages were init'ed in early boot */
                 if (!hstate_is_gigantic(h))
                         hugetlb_hstate_alloc_pages(h);
         }
+       VM_BUG_ON(minimum_order == UINT_MAX);
  }
  
  static char * __init memfmt(char *buf, unsigned long n)
@@ -2454,6 +2607,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         struct resv_map *resv = vma_resv_map(vma);
         struct hugepage_subpool *spool = subpool_vma(vma);
         unsigned long reserve, start, end;
+       long gbl_reserve;
  
         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 return;
@@ -2466,8 +2620,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         kref_put(&resv->refs, resv_map_release);
  
         if (reserve) {
-               hugetlb_acct_memory(h, -reserve);
-               hugepage_subpool_put_pages(spool, reserve);
+               /*
+                * Decrement reserve counts.  The global reserve count may be
+                * adjusted if the subpool has a minimum size.
+                */
+               gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+               hugetlb_acct_memory(h, -gbl_reserve);
         }
  }
  
@@ -2891,6 +3049,7 @@ retry_avoidcopy:
         copy_user_huge_page(new_page, old_page, address, vma,
                             pages_per_huge_page(h));
         __SetPageUptodate(new_page);
+       set_page_huge_active(new_page);
  
         mmun_start = address & huge_page_mask(h);
         mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3162,7 @@ retry:
                 }
                 clear_huge_page(page, address, pages_per_huge_page(h));
                 __SetPageUptodate(page);
+               set_page_huge_active(page);
  
                 if (vma->vm_flags & VM_MAYSHARE) {
                         int err;
@@ -3277,6 +3437,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 int absent;
                 struct page *page;
  
+               /*
+                * If we have a pending SIGKILL, don't keep faulting pages and
+                * potentially allocating memory.
+                */
+               if (unlikely(fatal_signal_pending(current))) {
+                       remainder = 0;
+                       break;
+               }
+
                 /*
                  * Some archs (sparc64, sh*) have multiple pte_ts to
                  * each hugepage.  We have to make sure we get the
@@ -3438,6 +3607,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct hstate *h = hstate_inode(inode);
         struct hugepage_subpool *spool = subpool_inode(inode);
         struct resv_map *resv_map;
+       long gbl_reserve;
  
         /*
          * Only apply hugepage reservation if asked. At fault time, an
@@ -3474,8 +3644,13 @@ int hugetlb_reserve_pages(struct inode *inode,
                 goto out_err;
         }
  
-       /* There must be enough pages in the subpool for the mapping */
-       if (hugepage_subpool_get_pages(spool, chg)) {
+       /*
+        * There must be enough pages in the subpool for the mapping. If
+        * the subpool has a minimum size, there may be some global
+        * reservations already in place (gbl_reserve).
+        */
+       gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+       if (gbl_reserve < 0) {
                 ret = -ENOSPC;
                 goto out_err;
         }
@@ -3484,9 +3659,10 @@ int hugetlb_reserve_pages(struct inode *inode,
          * Check enough hugepages are available for the reservation.
          * Hand the pages back to the subpool if there are not
          */
-       ret = hugetlb_acct_memory(h, chg);
+       ret = hugetlb_acct_memory(h, gbl_reserve);
         if (ret < 0) {
-               hugepage_subpool_put_pages(spool, chg);
+               /* put back original number of pages, chg */
+               (void)hugepage_subpool_put_pages(spool, chg);
                 goto out_err;
         }
  
@@ -3516,6 +3692,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         struct resv_map *resv_map = inode_resv_map(inode);
         long chg = 0;
         struct hugepage_subpool *spool = subpool_inode(inode);
+       long gbl_reserve;
  
         if (resv_map)
                 chg = region_truncate(resv_map, offset);
@@ -3523,8 +3700,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
         spin_unlock(&inode->i_lock);
  
-       hugepage_subpool_put_pages(spool, (chg - freed));
-       hugetlb_acct_memory(h, -(chg - freed));
+       /*
+        * If the subpool has a minimum size, the number of global
+        * reservations to be released may be adjusted.
+        */
+       gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+       hugetlb_acct_memory(h, -gbl_reserve);
  }
  
  #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3659,6 +3840,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
  {
         return NULL;
  }
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
  #define want_pmd_share()       (0)
  #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
  
@@ -3735,8 +3921,7 @@ retry:
         if (!pmd_huge(*pmd))
                 goto out;
         if (pmd_present(*pmd)) {
-               page = pte_page(*(pte_t *)pmd) +
-                       ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
                 if (flags & FOLL_GET)
                         get_page(page);
         } else {
@@ -3767,20 +3952,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
  
  #ifdef CONFIG_MEMORY_FAILURE
  
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-       struct page *page;
-       struct page *tmp;
-       struct hstate *h = page_hstate(hpage);
-       int nid = page_to_nid(hpage);
-
-       list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-               if (page == hpage)
-                       return 1;
-       return 0;
-}
-
  /*
   * This function is called from memory failure code.
   * Assume the caller holds page lock of the head page.
@@ -3792,7 +3963,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
         int ret = -EBUSY;
  
         spin_lock(&hugetlb_lock);
-       if (is_hugepage_on_freelist(hpage)) {
+       /*
+        * Just checking !page_huge_active is not enough, because that could be
+        * an isolated/hwpoisoned hugepage (which has a refcount > 0).
+        */
+       if (!page_huge_active(hpage) && !page_count(hpage)) {
                 /*
                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
                  * but dangling hpage->lru can trigger list-debug warnings
@@ -3812,42 +3987,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
  
  bool isolate_huge_page(struct page *page, struct list_head *list)
  {
+       bool ret = true;
+
         VM_BUG_ON_PAGE(!PageHead(page), page);
-       if (!get_page_unless_zero(page))
-               return false;
         spin_lock(&hugetlb_lock);
+       if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+               ret = false;
+               goto unlock;
+       }
+       clear_page_huge_active(page);
         list_move_tail(&page->lru, list);
+unlock:
         spin_unlock(&hugetlb_lock);
-       return true;
+       return ret;
  }
  
  void putback_active_hugepage(struct page *page)
  {
         VM_BUG_ON_PAGE(!PageHead(page), page);
         spin_lock(&hugetlb_lock);
+       set_page_huge_active(page);
         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
         spin_unlock(&hugetlb_lock);
         put_page(page);
  }
-
-bool is_hugepage_active(struct page *page)
-{
-       VM_BUG_ON_PAGE(!PageHuge(page), page);
-       /*
-        * This function can be called for a tail page because the caller,
-        * scan_movable_pages, scans through a given pfn-range which typically
-        * covers one memory block. In systems using gigantic hugepage (1GB
-        * for x86_64,) a hugepage is larger than a memory block, and we don't
-        * support migrating such large hugepages for now, so return false
-        * when called for tail pages.
-        */
-       if (PageTail(page))
-               return false;
-       /*
-        * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-        * so we should return false for them.
-        */
-       if (unlikely(PageHWPoison(page)))
-               return false;
-       return page_count(page) > 0;
-}