Merge tag 'hwlock-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/andersson...
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2a..6c26916 100644
@@ -40,7 +40,6 @@
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
 
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 #include <linux/swap_cgroup.h>
@@ -601,7 +600,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 {
        struct percpu_cluster *cluster;
        struct swap_cluster_info *ci;
-       bool found_free;
        unsigned long tmp, max;
 
 new_cluster:
@@ -614,17 +612,17 @@ new_cluster:
                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
-                        * discarding, do discard now and reclaim them
+                        * discarding; do the discard now and reclaim them, then
+                        * reread cluster_next_cpu since we dropped si->lock.
                         */
                        swap_do_scheduled_discard(si);
-                       *scan_base = *offset = si->cluster_next;
+                       *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                       *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }
 
-       found_free = false;
-
        /*
         * Other CPUs can use our cluster if they can't find a free cluster,
         * check if there is still free entry in the cluster
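The reread in the hunk above matters because swap_do_scheduled_discard()
drops and retakes si->lock, so any scan base cached before the call may
be stale. A minimal userspace sketch of the same relock-and-retry shape,
assuming a pthread mutex in place of si->lock (all names illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t si_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long cluster_next = 100;  /* stand-in for the cursor */
    static int need_discard = 1;

    /* Like swap_do_scheduled_discard(): drops si_lock around slow work. */
    static void do_discard(void)
    {
            pthread_mutex_unlock(&si_lock);
            cluster_next = 200;  /* stands in for another thread moving it */
            pthread_mutex_lock(&si_lock);
    }

    int main(void)
    {
            unsigned long scan_base;

            pthread_mutex_lock(&si_lock);
    new_cluster:
            scan_base = cluster_next;   /* snapshot taken under the lock */
            if (need_discard) {
                    need_discard = 0;
                    do_discard();       /* lock dropped: snapshot is stale */
                    goto new_cluster;   /* so reread it, as the hunk does */
            }
            pthread_mutex_unlock(&si_lock);
            printf("scan_base = %lu\n", scan_base);  /* 200, not 100 */
            return 0;
    }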
@@ -632,27 +630,23 @@ new_cluster:
        tmp = cluster->next;
        max = min_t(unsigned long, si->max,
                    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
-       if (tmp >= max) {
-               cluster_set_null(&cluster->index);
-               goto new_cluster;
-       }
-       ci = lock_cluster(si, tmp);
-       while (tmp < max) {
-               if (!si->swap_map[tmp]) {
-                       found_free = true;
-                       break;
+       if (tmp < max) {
+               ci = lock_cluster(si, tmp);
+               while (tmp < max) {
+                       if (!si->swap_map[tmp])
+                               break;
+                       tmp++;
                }
-               tmp++;
+               unlock_cluster(ci);
        }
-       unlock_cluster(ci);
-       if (!found_free) {
+       if (tmp >= max) {
                cluster_set_null(&cluster->index);
                goto new_cluster;
        }
        cluster->next = tmp + 1;
        *offset = tmp;
        *scan_base = tmp;
-       return found_free;
+       return true;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
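These hunks also let the found_free flag go: once the scan loop
finishes, "tmp >= max" is exactly the not-found condition, so the extra
bool was redundant state. The same refactor in miniature (the outer
check mirrors the kernel's guard around lock_cluster()):

    #include <stdio.h>

    /* Return the first free index in [tmp, max), or -1 if none: the
     * exhausted cursor already encodes "nothing found". */
    static long first_free(const unsigned char *map, unsigned long tmp,
                           unsigned long max)
    {
            if (tmp < max) {
                    while (tmp < max) {
                            if (!map[tmp])
                                    break;
                            tmp++;
                    }
            }
            return tmp >= max ? -1 : (long)tmp;
    }

    int main(void)
    {
            unsigned char map[8] = { 1, 1, 0, 1, 1, 1, 1, 1 };

            printf("%ld\n", first_free(map, 0, 8));  /* prints 2 */
            return 0;
    }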
@@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
        }
 }
 
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+       unsigned long prev;
+
+       if (!(si->flags & SWP_SOLIDSTATE)) {
+               si->cluster_next = next;
+               return;
+       }
+
+       prev = this_cpu_read(*si->cluster_next_cpu);
+       /*
+        * When crossing a swap address space size aligned trunk, choose
+        * another trunk at random to reduce lock contention on the swap
+        * address space, if possible.
+        */
+       if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+           (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+               /* No free swap slots available */
+               if (si->highest_bit <= si->lowest_bit)
+                       return;
+               next = si->lowest_bit +
+                       prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+               next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+               next = max_t(unsigned int, next, si->lowest_bit);
+       }
+       this_cpu_write(*si->cluster_next_cpu, next);
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[])
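The trunk-hopping arithmetic in set_cluster_next() can be checked in
isolation. A small userspace sketch, with rand() standing in for
prandom_u32_max() and SWAP_ADDRESS_SPACE_SHIFT taken as 14 as in the
kernel:

    #include <stdio.h>
    #include <stdlib.h>

    #define SWAP_ADDRESS_SPACE_SHIFT 14
    #define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
    #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

    /* Pick a new start when the old one crossed a trunk boundary. */
    static unsigned long pick_next(unsigned long lowest, unsigned long highest)
    {
            unsigned long next;

            next = lowest + (unsigned long)rand() % (highest - lowest + 1);
            next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
            /* ALIGN_DOWN can land below lowest; clamp like max_t() does. */
            if (next < lowest)
                    next = lowest;
            return next;
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++)
                    printf("%lu\n", pick_next(3, 1000000));
            return 0;
    }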
@@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
        int n_ret = 0;
-
-       if (nr > SWAP_BATCH)
-               nr = SWAP_BATCH;
+       bool scanned_many = false;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
         */
 
        si->flags += SWP_SCANNING;
-       scan_base = offset = si->cluster_next;
+       /*
+        * Use a per-CPU scan base for SSD to reduce lock contention on
+        * the cluster and swap cache.  For HDD, sequential access is
+        * more important.
+        */
+       if (si->flags & SWP_SOLIDSTATE)
+               scan_base = this_cpu_read(*si->cluster_next_cpu);
+       else
+               scan_base = si->cluster_next;
+       offset = scan_base;
 
        /* SSD algorithm */
        if (si->cluster_info) {
-               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
-                       goto checks;
-               else
+               if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                        goto scan;
-       }
-
-       if (unlikely(!si->cluster_nr--)) {
+       } else if (unlikely(!si->cluster_nr--)) {
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
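For readers new to the percpu pattern: each CPU keeps a private
cluster_next_cpu, so concurrent allocators start scanning in different
regions of the device. A rough userspace analogue using C11
thread-local storage (the kernel's this_cpu_read()/this_cpu_write()
additionally handle preemption while touching the current CPU's slot;
this sketch ignores that):

    #include <stdio.h>
    #include <threads.h>

    static thread_local unsigned long cluster_next_tls;  /* one per thread */

    static unsigned long scan_base(int is_ssd, unsigned long global_next)
    {
            /* SSD: private cursor avoids contention; HDD: stay sequential. */
            return is_ssd ? cluster_next_tls : global_next;
    }

    int main(void)
    {
            cluster_next_tls = 42;
            printf("ssd=%lu hdd=%lu\n", scan_base(1, 7), scan_base(0, 7));
            return 0;
    }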
@@ -848,7 +873,6 @@ checks:
        unlock_cluster(ci);
 
        swap_range_alloc(si, offset, 1);
-       si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
        /* got enough slots or reach max slots? */
@@ -871,19 +895,33 @@ checks:
        if (si->cluster_info) {
                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                        goto checks;
-               else
-                       goto done;
-       }
-       /* non-ssd case */
-       ++offset;
-
-       /* non-ssd case, still more slots in cluster? */
-       if (si->cluster_nr && !si->swap_map[offset]) {
+       } else if (si->cluster_nr && !si->swap_map[++offset]) {
+               /* non-ssd case, still more slots in cluster? */
                --si->cluster_nr;
                goto checks;
        }
 
+       /*
+        * Even if there are no free clusters available (fragmented),
+        * try to scan a little more quickly with the lock held, unless
+        * we have already scanned too many slots.
+        */
+       if (!scanned_many) {
+               unsigned long scan_limit;
+
+               if (offset < scan_base)
+                       scan_limit = scan_base;
+               else
+                       scan_limit = si->highest_bit;
+               for (; offset <= scan_limit && --latency_ration > 0;
+                    offset++) {
+                       if (!si->swap_map[offset])
+                               goto checks;
+               }
+       }
+
 done:
+       set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;
 
@@ -901,6 +939,7 @@ scan:
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
+                       scanned_many = true;
                }
        }
        offset = si->lowest_bit;
@@ -916,6 +955,7 @@ scan:
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
+                       scanned_many = true;
                }
                offset++;
        }
@@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
        if (avail_pgs <= 0)
                goto noswap;
 
-       if (n_goal > SWAP_BATCH)
-               n_goal = SWAP_BATCH;
-
-       if (n_goal > avail_pgs)
-               n_goal = avail_pgs;
+       n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
 
        atomic_long_sub(n_goal * size, &nr_swap_pages);
 
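min3() folds the two removed branches into a single expression. A quick
illustration (the kernel macro also type-checks its arguments, which is
why all three are cast to long above; the plain macro here skips that):

    #include <stdio.h>

    #define min(a, b)     ((a) < (b) ? (a) : (b))
    #define min3(a, b, c) min(min(a, b), (c))

    int main(void)
    {
            long n_goal = 100, avail_pgs = 37, batch = 64;  /* SWAP_BATCH */

            printf("%ld\n", min3(n_goal, batch, avail_pgs));  /* prints 37 */
            return 0;
    }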
@@ -1275,13 +1311,14 @@ unlock_out:
 }
 
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
-                                      swp_entry_t entry, unsigned char usage)
+                                      swp_entry_t entry)
 {
        struct swap_cluster_info *ci;
        unsigned long offset = swp_offset(entry);
+       unsigned char usage;
 
        ci = lock_cluster_or_swap_info(p, offset);
-       usage = __swap_entry_free_locked(p, offset, usage);
+       usage = __swap_entry_free_locked(p, offset, 1);
        unlock_cluster_or_swap_info(p, ci);
        if (!usage)
                free_swap_slot(entry);
@@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry)
 
        p = _swap_info_get(entry);
        if (p)
-               __swap_entry_free(p, entry, 1);
+               __swap_entry_free(p, entry);
 }
 
 /*
@@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
        p = _swap_info_get(entry);
        if (p) {
-               count = __swap_entry_free(p, entry, 1);
+               count = __swap_entry_free(p, entry);
                if (count == SWAP_HAS_CACHE &&
                    !swap_page_trans_huge_swapped(p, entry))
                        __try_to_reclaim_swap(p, swp_offset(entry),
@@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
        struct page *swapcache;
-       struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
@@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        if (unlikely(!page))
                return -ENOMEM;
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
-               ret = -ENOMEM;
-               goto out_nolock;
-       }
-
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
-               mem_cgroup_cancel_charge(page, memcg, false);
                ret = 0;
                goto out;
        }
@@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        if (page == swapcache) {
                page_add_anon_rmap(page, vma, addr, false);
-               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
                page_add_new_anon_rmap(page, vma, addr, false);
-               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
        swap_free(entry);
@@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        activate_page(page);
 out:
        pte_unmap_unlock(pte, ptl);
-out_nolock:
        if (page != swapcache) {
                unlock_page(page);
                put_page(page);
@@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
                pte_unmap(pte);
                swap_map = &si->swap_map[offset];
-               vmf.vma = vma;
-               vmf.address = addr;
-               vmf.pmd = pmd;
-               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+               page = lookup_swap_cache(entry, vma, addr);
+               if (!page) {
+                       vmf.vma = vma;
+                       vmf.address = addr;
+                       vmf.pmd = pmd;
+                       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+                                               &vmf);
+               }
                if (!page) {
                        if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
                                goto try_next;
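The hunk above adds a cache-first fast path: readahead is only worth
its I/O and allocation cost when the page is not already in the swap
cache. The shape of the pattern as a toy sketch (cache_lookup() and
read_and_cache() are illustrative stand-ins, not kernel APIs):

    #include <stdio.h>

    static int cache[16];   /* toy swap cache, keyed by entry */

    static int *cache_lookup(unsigned long key)
    {
            return cache[key % 16] ? &cache[key % 16] : NULL;
    }

    static int *read_and_cache(unsigned long key)
    {
            cache[key % 16] = (int)key;   /* "I/O" fills the cache */
            return &cache[key % 16];
    }

    int main(void)
    {
            int *page = cache_lookup(5);      /* cheap: no I/O if cached */
            if (!page)
                    page = read_and_cache(5); /* slow path with readahead */
            printf("%d\n", *page);
            return 0;
    }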
@@ -2070,7 +2100,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
        struct vm_area_struct *vma;
        int ret = 0;
 
-       down_read(&mm->mmap_sem);
+       mmap_read_lock(mm);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (vma->anon_vma) {
                        ret = unuse_vma(vma, type, frontswap,
@@ -2080,7 +2110,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
                }
                cond_resched();
        }
-       up_read(&mm->mmap_sem);
+       mmap_read_unlock(mm);
        return ret;
 }
 
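mmap_read_lock()/mmap_read_unlock() are thin wrappers introduced so the
MM locking primitive can be changed without editing every caller. At
this point in the series they reduce to roughly the following
(paraphrased from include/linux/mmap_lock.h):

    static inline void mmap_read_lock(struct mm_struct *mm)
    {
            down_read(&mm->mmap_sem);
    }

    static inline void mmap_read_unlock(struct mm_struct *mm)
    {
            up_read(&mm->mmap_sem);
    }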
@@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        kvfree(frontswap_map);
@@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v)
        struct swap_info_struct *si = v;
        struct file *file;
        int len;
+       unsigned int bytes, inuse;
 
        if (si == SEQ_START_TOKEN) {
-               seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+               seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }
 
+       bytes = si->pages << (PAGE_SHIFT - 10);
+       inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");
-       seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+       seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
                        len < 40 ? 40 - len : 1, " ",
                        S_ISBLK(file_inode(file)->i_mode) ?
                                "partition" : "file\t",
-                       si->pages << (PAGE_SHIFT - 10),
-                       si->inuse_pages << (PAGE_SHIFT - 10),
+                       bytes, bytes < 10000000 ? "\t" : "",
+                       inuse, inuse < 10000000 ? "\t" : "",
                        si->prio);
        return 0;
 }
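The 10000000 threshold is a column-alignment trick: a KB count under
eight digits gets one extra tab so Used and Priority line up for both
small and terabyte-scale swap devices. The format logic is easy to
check in userspace:

    #include <stdio.h>

    int main(void)
    {
            unsigned int sizes[] = { 524284, 33554428 };  /* KB values */

            printf("Size\t\tUsed\t\tPriority\n");
            for (int i = 0; i < 2; i++) {
                    unsigned int bytes = sizes[i], inuse = sizes[i] / 3;

                    /* below 10000000, pad with one more tab */
                    printf("%u\t%s%u\t%s%d\n",
                           bytes, bytes < 10000000 ? "\t" : "",
                           inuse, inuse < 10000000 ? "\t" : "", -2);
            }
            return 0;
    }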
@@ -2893,7 +2929,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
                 * write only restriction.  Hence zoned block devices are not
                 * suitable for swapping.  Disallow them here.
                 */
-               if (blk_queue_is_zoned(p->bdev->bd_queue))
+               if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
                        return -EINVAL;
                p->flags |= SWP_BLKDEV;
        } else if (S_ISREG(inode->i_mode)) {
@@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                unsigned long ci, nr_cluster;
 
                p->flags |= SWP_SOLIDSTATE;
+               p->cluster_next_cpu = alloc_percpu(unsigned int);
+               if (!p->cluster_next_cpu) {
+                       error = -ENOMEM;
+                       goto bad_swap_unlock_inode;
+               }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
-               p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+               for_each_possible_cpu(cpu) {
+                       per_cpu(*p->cluster_next_cpu, cpu) =
+                               1 + prandom_u32_max(p->highest_bit);
+               }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
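prandom_u32_max(n) returns a value in [0, n), so each CPU's cursor
starts somewhere in [1, highest_bit]. The kernel implements it with a
multiply-and-shift rather than "% n" to avoid modulo bias; a userspace
rendition (rand_u32() is an illustrative PRNG stand-in):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uint32_t rand_u32(void)
    {
            return ((uint32_t)rand() << 16) ^ (uint32_t)rand();
    }

    /* Map a full-range u32 into [0, ep_ro) without modulo bias. */
    static uint32_t prandom_u32_max(uint32_t ep_ro)
    {
            return (uint32_t)(((uint64_t)rand_u32() * ep_ro) >> 32);
    }

    int main(void)
    {
            uint32_t highest_bit = 1 << 20;

            for (int cpu = 0; cpu < 4; cpu++)
                    printf("cpu%d start: %u\n", cpu,
                           1 + prandom_u32_max(highest_bit));
            return 0;
    }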
@@ -3322,6 +3366,8 @@ bad_swap_unlock_inode:
 bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 
        spin_lock(&si->cont_lock);
        offset &= ~PAGE_MASK;
-       page = list_entry(head->lru.next, struct page, lru);
+       page = list_next_entry(head, lru);
        map = kmap_atomic(page) + offset;
 
        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
@@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
                 */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
                if (*map == SWAP_CONT_MAX) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;    /* add count continuation */
                                goto out;
@@ -3682,12 +3728,10 @@ init_map:               *map = 0;               /* we didn't zero the page */
                }
                *map += 1;
                kunmap_atomic(map);
-               page = list_entry(page->lru.prev, struct page, lru);
-               while (page != head) {
+               while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.prev, struct page, lru);
                }
                ret = true;                     /* incremented */
 
@@ -3698,7 +3742,7 @@ init_map:         *map = 0;               /* we didn't zero the page */
                BUG_ON(count != COUNT_CONTINUED);
                while (*map == COUNT_CONTINUED) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
@@ -3707,13 +3751,11 @@ init_map:               *map = 0;               /* we didn't zero the page */
                if (*map == 0)
                        count = 0;
                kunmap_atomic(map);
-               page = list_entry(page->lru.prev, struct page, lru);
-               while (page != head) {
+               while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.prev, struct page, lru);
                }
                ret = count == COUNT_CONTINUED;
        }
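list_next_entry()/list_prev_entry() are behavior-preserving sugar over
list_entry(); paraphrasing include/linux/list.h:

    #define list_next_entry(pos, member) \
            list_entry((pos)->member.next, typeof(*(pos)), member)

    #define list_prev_entry(pos, member) \
            list_entry((pos)->member.prev, typeof(*(pos)), member)

Besides being shorter, typeof(*(pos)) removes the chance of naming the
wrong containing type, which the open-coded list_entry(page->lru.next,
struct page, lru) calls left to the programmer.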
@@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
-                                 gfp_t gfp_mask)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
 {
        struct swap_info_struct *si, *next;
-       if (!(gfp_mask & __GFP_IO) || !memcg)
+       int nid = page_to_nid(page);
+
+       if (!(gfp_mask & __GFP_IO))
                return;
 
        if (!blk_cgroup_congested())
@@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
                return;
 
        spin_lock(&swap_avail_lock);
-       plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
-                                 avail_lists[node]) {
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+                                 avail_lists[nid]) {
                if (si->bdev) {
-                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
-                                               true);
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
                        break;
                }
        }