Merge tag 'hwlock-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/andersson...
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2a..6c26916 100644
@@ -40,7 +40,6 @@
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
 
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 #include <linux/swap_cgroup.h>
@@ -601,7 +600,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 {
        struct percpu_cluster *cluster;
        struct swap_cluster_info *ci;
-       bool found_free;
        unsigned long tmp, max;
 
 new_cluster:
@@ -614,17 +612,17 @@ new_cluster:
                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
-                        * discarding, do discard now and reclaim them
+                        * discarding; do the discard now and reclaim them, then
+                        * reread cluster_next_cpu since we dropped si->lock.
                         */
                        swap_do_scheduled_discard(si);
-                       *scan_base = *offset = si->cluster_next;
+                       *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                       *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }
 
-       found_free = false;
-
        /*
         * Other CPUs can use our cluster if they can't find a free cluster,
         * check if there is still free entry in the cluster
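The reread in the hunk above matters because swap_do_scheduled_discard()
drops and retakes si->lock, so any scan base cached before the call may
be stale. A minimal userspace sketch of the same relock-and-retry shape,
assuming a pthread mutex in place of si->lock (all names illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t si_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long cluster_next = 100;  /* stand-in for the cursor */
    static int need_discard = 1;

    /* Like swap_do_scheduled_discard(): drops si_lock around slow work. */
    static void do_discard(void)
    {
            pthread_mutex_unlock(&si_lock);
            cluster_next = 200;  /* stands in for another thread moving it */
            pthread_mutex_lock(&si_lock);
    }

    int main(void)
    {
            unsigned long scan_base;

            pthread_mutex_lock(&si_lock);
    new_cluster:
            scan_base = cluster_next;   /* snapshot taken under the lock */
            if (need_discard) {
                    need_discard = 0;
                    do_discard();       /* lock dropped: snapshot is stale */
                    goto new_cluster;   /* so reread it, as the hunk does */
            }
            pthread_mutex_unlock(&si_lock);
            printf("scan_base = %lu\n", scan_base);  /* 200, not 100 */
            return 0;
    }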
@@ -632,27 +630,23 @@ new_cluster:
        tmp = cluster->next;
        max = min_t(unsigned long, si->max,
                    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
-       if (tmp >= max) {
-               cluster_set_null(&cluster->index);
-               goto new_cluster;
-       }
-       ci = lock_cluster(si, tmp);
-       while (tmp < max) {
-               if (!si->swap_map[tmp]) {
-                       found_free = true;
-                       break;
+       if (tmp < max) {
+               ci = lock_cluster(si, tmp);
+               while (tmp < max) {
+                       if (!si->swap_map[tmp])
+                               break;
+                       tmp++;
                }
-               tmp++;
+               unlock_cluster(ci);
        }
-       unlock_cluster(ci);
-       if (!found_free) {
+       if (tmp >= max) {
                cluster_set_null(&cluster->index);
                goto new_cluster;
        }
        cluster->next = tmp + 1;
        *offset = tmp;
        *scan_base = tmp;
-       return found_free;
+       return true;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
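These hunks also let the found_free flag go: once the scan loop
finishes, "tmp >= max" is exactly the not-found condition, so the extra
bool was redundant state. The same refactor in miniature (the outer
check mirrors the kernel's guard around lock_cluster()):

    #include <stdio.h>

    /* Return the first free index in [tmp, max), or -1 if none: the
     * exhausted cursor already encodes "nothing found". */
    static long first_free(const unsigned char *map, unsigned long tmp,
                           unsigned long max)
    {
            if (tmp < max) {
                    while (tmp < max) {
                            if (!map[tmp])
                                    break;
                            tmp++;
                    }
            }
            return tmp >= max ? -1 : (long)tmp;
    }

    int main(void)
    {
            unsigned char map[8] = { 1, 1, 0, 1, 1, 1, 1, 1 };

            printf("%ld\n", first_free(map, 0, 8));  /* prints 2 */
            return 0;
    }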
@@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
        }
 }
 
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+       unsigned long prev;
+
+       if (!(si->flags & SWP_SOLIDSTATE)) {
+               si->cluster_next = next;
+               return;
+       }
+
+       prev = this_cpu_read(*si->cluster_next_cpu);
+       /*
+        * When crossing a swap address space size aligned trunk, choose
+        * another trunk at random to reduce lock contention on the swap
+        * address space, if possible.
+        */
+       if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+           (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+               /* No free swap slots available */
+               if (si->highest_bit <= si->lowest_bit)
+                       return;
+               next = si->lowest_bit +
+                       prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+               next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+               next = max_t(unsigned int, next, si->lowest_bit);
+       }
+       this_cpu_write(*si->cluster_next_cpu, next);
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[])
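The trunk-hopping arithmetic in set_cluster_next() can be checked in
isolation. A small userspace sketch, with rand() standing in for
prandom_u32_max() and SWAP_ADDRESS_SPACE_SHIFT taken as 14 as in the
kernel:

    #include <stdio.h>
    #include <stdlib.h>

    #define SWAP_ADDRESS_SPACE_SHIFT 14
    #define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
    #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

    /* Pick a new start when the old one crossed a trunk boundary. */
    static unsigned long pick_next(unsigned long lowest, unsigned long highest)
    {
            unsigned long next;

            next = lowest + (unsigned long)rand() % (highest - lowest + 1);
            next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
            /* ALIGN_DOWN can land below lowest; clamp like max_t() does. */
            if (next < lowest)
                    next = lowest;
            return next;
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++)
                    printf("%lu\n", pick_next(3, 1000000));
            return 0;
    }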
@@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
        int n_ret = 0;
-
-       if (nr > SWAP_BATCH)
-               nr = SWAP_BATCH;
+       bool scanned_many = false;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
         */
 
        si->flags += SWP_SCANNING;
-       scan_base = offset = si->cluster_next;
+       /*
+        * Use a per-CPU scan base for SSD to reduce lock contention on
+        * the cluster and swap cache.  For HDD, sequential access is
+        * more important.
+        */
+       if (si->flags & SWP_SOLIDSTATE)
+               scan_base = this_cpu_read(*si->cluster_next_cpu);
+       else
+               scan_base = si->cluster_next;
+       offset = scan_base;
 
        /* SSD algorithm */
        if (si->cluster_info) {
-               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
-                       goto checks;
-               else
+               if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                        goto scan;
-       }
-
-       if (unlikely(!si->cluster_nr--)) {
+       } else if (unlikely(!si->cluster_nr--)) {
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
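For readers new to the percpu pattern: each CPU keeps a private
cluster_next_cpu, so concurrent allocators start scanning in different
regions of the device. A rough userspace analogue using C11
thread-local storage (the kernel's this_cpu_read()/this_cpu_write()
additionally handle preemption while touching the current CPU's slot;
this sketch ignores that):

    #include <stdio.h>
    #include <threads.h>

    static thread_local unsigned long cluster_next_tls;  /* one per thread */

    static unsigned long scan_base(int is_ssd, unsigned long global_next)
    {
            /* SSD: private cursor avoids contention; HDD: stay sequential. */
            return is_ssd ? cluster_next_tls : global_next;
    }

    int main(void)
    {
            cluster_next_tls = 42;
            printf("ssd=%lu hdd=%lu\n", scan_base(1, 7), scan_base(0, 7));
            return 0;
    }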
@@ -848,7 +873,6 @@ checks:
        unlock_cluster(ci);
 
        swap_range_alloc(si, offset, 1);
-       si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
        /* got enough slots or reach max slots? */
@@ -871,19 +895,33 @@ checks:
        if (si->cluster_info) {
                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                        goto checks;
-               else
-                       goto done;
-       }
-       /* non-ssd case */
-       ++offset;
-
-       /* non-ssd case, still more slots in cluster? */
-       if (si->cluster_nr && !si->swap_map[offset]) {
+       } else if (si->cluster_nr && !si->swap_map[++offset]) {
+               /* non-ssd case, still more slots in cluster? */
                --si->cluster_nr;
                goto checks;
        }
 
+       /*
+        * Even if there are no free clusters available (fragmented),
+        * try to scan a little more quickly with the lock held, unless
+        * we have already scanned too many slots.
+        */
+       if (!scanned_many) {
+               unsigned long scan_limit;
+
+               if (offset < scan_base)
+                       scan_limit = scan_base;
+               else
+                       scan_limit = si->highest_bit;
+               for (; offset <= scan_limit && --latency_ration > 0;
+                    offset++) {
+                       if (!si->swap_map[offset])
+                               goto checks;
+               }
+       }
+
 done:
+       set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;
 
@@ -901,6 +939,7 @@ scan:
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
+                       scanned_many = true;
                }
        }
        offset = si->lowest_bit;
@@ -916,6 +955,7 @@ scan:
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
+                       scanned_many = true;
                }
                offset++;
        }
@@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
        if (avail_pgs <= 0)
                goto noswap;
 
-       if (n_goal > SWAP_BATCH)
-               n_goal = SWAP_BATCH;
-
-       if (n_goal > avail_pgs)
-               n_goal = avail_pgs;
+       n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
 
        atomic_long_sub(n_goal * size, &nr_swap_pages);
 
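min3() folds the two removed branches into a single expression. A quick
illustration (the kernel macro also type-checks its arguments, which is
why all three are cast to long above; the plain macro here skips that):

    #include <stdio.h>

    #define min(a, b)     ((a) < (b) ? (a) : (b))
    #define min3(a, b, c) min(min(a, b), (c))

    int main(void)
    {
            long n_goal = 100, avail_pgs = 37, batch = 64;  /* SWAP_BATCH */

            printf("%ld\n", min3(n_goal, batch, avail_pgs));  /* prints 37 */
            return 0;
    }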
@@ -1275,13 +1311,14 @@ unlock_out:
 }
 
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
-                                      swp_entry_t entry, unsigned char usage)
+                                      swp_entry_t entry)
 {
        struct swap_cluster_info *ci;
        unsigned long offset = swp_offset(entry);
+       unsigned char usage;
 
        ci = lock_cluster_or_swap_info(p, offset);
-       usage = __swap_entry_free_locked(p, offset, usage);
+       usage = __swap_entry_free_locked(p, offset, 1);
        unlock_cluster_or_swap_info(p, ci);
        if (!usage)
                free_swap_slot(entry);
@@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry)
 
        p = _swap_info_get(entry);
        if (p)
-               __swap_entry_free(p, entry, 1);
+               __swap_entry_free(p, entry);
 }
 
 /*
@@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
        p = _swap_info_get(entry);
        if (p) {
-               count = __swap_entry_free(p, entry, 1);
+               count = __swap_entry_free(p, entry);
                if (count == SWAP_HAS_CACHE &&
                    !swap_page_trans_huge_swapped(p, entry))
                        __try_to_reclaim_swap(p, swp_offset(entry),
@@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
        struct page *swapcache;
-       struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
@@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        if (unlikely(!page))
                return -ENOMEM;
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
-               ret = -ENOMEM;
-               goto out_nolock;
-       }
-
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
-               mem_cgroup_cancel_charge(page, memcg, false);
                ret = 0;
                goto out;
        }
@@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        if (page == swapcache) {
                page_add_anon_rmap(page, vma, addr, false);
-               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
                page_add_new_anon_rmap(page, vma, addr, false);
-               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
        swap_free(entry);
@@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        activate_page(page);
 out:
        pte_unmap_unlock(pte, ptl);
-out_nolock:
        if (page != swapcache) {
                unlock_page(page);
                put_page(page);
@@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
                pte_unmap(pte);
                swap_map = &si->swap_map[offset];
-               vmf.vma = vma;
-               vmf.address = addr;
-               vmf.pmd = pmd;
-               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+               page = lookup_swap_cache(entry, vma, addr);
+               if (!page) {
+                       vmf.vma = vma;
+                       vmf.address = addr;
+                       vmf.pmd = pmd;
+                       page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+                                               &vmf);
+               }
                if (!page) {
                        if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
                                goto try_next;
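The hunk above adds a cache-first fast path: readahead is only worth
its I/O and allocation cost when the page is not already in the swap
cache. The shape of the pattern as a toy sketch (cache_lookup() and
read_and_cache() are illustrative stand-ins, not kernel APIs):

    #include <stdio.h>

    static int cache[16];   /* toy swap cache, keyed by entry */

    static int *cache_lookup(unsigned long key)
    {
            return cache[key % 16] ? &cache[key % 16] : NULL;
    }

    static int *read_and_cache(unsigned long key)
    {
            cache[key % 16] = (int)key;   /* "I/O" fills the cache */
            return &cache[key % 16];
    }

    int main(void)
    {
            int *page = cache_lookup(5);      /* cheap: no I/O if cached */
            if (!page)
                    page = read_and_cache(5); /* slow path with readahead */
            printf("%d\n", *page);
            return 0;
    }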
@@ -2070,7 +2100,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
        struct vm_area_struct *vma;
        int ret = 0;
 
-       down_read(&mm->mmap_sem);
+       mmap_read_lock(mm);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (vma->anon_vma) {
                        ret = unuse_vma(vma, type, frontswap,
@@ -2080,7 +2110,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
                }
                cond_resched();
        }
-       up_read(&mm->mmap_sem);
+       mmap_read_unlock(mm);
        return ret;
 }
 
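mmap_read_lock()/mmap_read_unlock() are thin wrappers introduced so the
MM locking primitive can be changed without editing every caller. At
this point in the series they reduce to roughly the following
(paraphrased from include/linux/mmap_lock.h):

    static inline void mmap_read_lock(struct mm_struct *mm)
    {
            down_read(&mm->mmap_sem);
    }

    static inline void mmap_read_unlock(struct mm_struct *mm)
    {
            up_read(&mm->mmap_sem);
    }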
@@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        kvfree(frontswap_map);
@@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v)
        struct swap_info_struct *si = v;
        struct file *file;
        int len;
+       unsigned int bytes, inuse;
 
        if (si == SEQ_START_TOKEN) {
-               seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+               seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }
 
+       bytes = si->pages << (PAGE_SHIFT - 10);
+       inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");
-       seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+       seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
                        len < 40 ? 40 - len : 1, " ",
                        S_ISBLK(file_inode(file)->i_mode) ?
                                "partition" : "file\t",
-                       si->pages << (PAGE_SHIFT - 10),
-                       si->inuse_pages << (PAGE_SHIFT - 10),
+                       bytes, bytes < 10000000 ? "\t" : "",
+                       inuse, inuse < 10000000 ? "\t" : "",
                        si->prio);
        return 0;
 }
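The 10000000 threshold is a column-alignment trick: a KB count under
eight digits gets one extra tab so Used and Priority line up for both
small and terabyte-scale swap devices. The format logic is easy to
check in userspace:

    #include <stdio.h>

    int main(void)
    {
            unsigned int sizes[] = { 524284, 33554428 };  /* KB values */

            printf("Size\t\tUsed\t\tPriority\n");
            for (int i = 0; i < 2; i++) {
                    unsigned int bytes = sizes[i], inuse = sizes[i] / 3;

                    /* below 10000000, pad with one more tab */
                    printf("%u\t%s%u\t%s%d\n",
                           bytes, bytes < 10000000 ? "\t" : "",
                           inuse, inuse < 10000000 ? "\t" : "", -2);
            }
            return 0;
    }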
@@ -2893,7 +2929,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
                 * write only restriction.  Hence zoned block devices are not
                 * suitable for swapping.  Disallow them here.
                 */
-               if (blk_queue_is_zoned(p->bdev->bd_queue))
+               if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
                        return -EINVAL;
                p->flags |= SWP_BLKDEV;
        } else if (S_ISREG(inode->i_mode)) {
@@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                unsigned long ci, nr_cluster;
 
                p->flags |= SWP_SOLIDSTATE;
+               p->cluster_next_cpu = alloc_percpu(unsigned int);
+               if (!p->cluster_next_cpu) {
+                       error = -ENOMEM;
+                       goto bad_swap_unlock_inode;
+               }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
-               p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+               for_each_possible_cpu(cpu) {
+                       per_cpu(*p->cluster_next_cpu, cpu) =
+                               1 + prandom_u32_max(p->highest_bit);
+               }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
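prandom_u32_max(n) returns a value in [0, n), so each CPU's cursor
starts somewhere in [1, highest_bit]. The kernel implements it with a
multiply-and-shift rather than "% n" to avoid modulo bias; a userspace
rendition (rand_u32() is an illustrative PRNG stand-in):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uint32_t rand_u32(void)
    {
            return ((uint32_t)rand() << 16) ^ (uint32_t)rand();
    }

    /* Map a full-range u32 into [0, ep_ro) without modulo bias. */
    static uint32_t prandom_u32_max(uint32_t ep_ro)
    {
            return (uint32_t)(((uint64_t)rand_u32() * ep_ro) >> 32);
    }

    int main(void)
    {
            uint32_t highest_bit = 1 << 20;

            for (int cpu = 0; cpu < 4; cpu++)
                    printf("cpu%d start: %u\n", cpu,
                           1 + prandom_u32_max(highest_bit));
            return 0;
    }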
@@ -3322,6 +3366,8 @@ bad_swap_unlock_inode:
 bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 
        spin_lock(&si->cont_lock);
        offset &= ~PAGE_MASK;
-       page = list_entry(head->lru.next, struct page, lru);
+       page = list_next_entry(head, lru);
        map = kmap_atomic(page) + offset;
 
        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
@@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
                 */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
                if (*map == SWAP_CONT_MAX) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;    /* add count continuation */
                                goto out;
@@ -3682,12 +3728,10 @@ init_map:               *map = 0;               /* we didn't zero the page */
                }
                *map += 1;
                kunmap_atomic(map);
-               page = list_entry(page->lru.prev, struct page, lru);
-               while (page != head) {
+               while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.prev, struct page, lru);
                }
                ret = true;                     /* incremented */
 
@@ -3698,7 +3742,7 @@ init_map:         *map = 0;               /* we didn't zero the page */
                BUG_ON(count != COUNT_CONTINUED);
                while (*map == COUNT_CONTINUED) {
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.next, struct page, lru);
+                       page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_atomic(page) + offset;
                }
@@ -3707,13 +3751,11 @@ init_map:               *map = 0;               /* we didn't zero the page */
                if (*map == 0)
                        count = 0;
                kunmap_atomic(map);
-               page = list_entry(page->lru.prev, struct page, lru);
-               while (page != head) {
+               while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_atomic(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_atomic(map);
-                       page = list_entry(page->lru.prev, struct page, lru);
                }
                ret = count == COUNT_CONTINUED;
        }
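list_next_entry()/list_prev_entry() are behavior-preserving sugar over
list_entry(); paraphrasing include/linux/list.h:

    #define list_next_entry(pos, member) \
            list_entry((pos)->member.next, typeof(*(pos)), member)

    #define list_prev_entry(pos, member) \
            list_entry((pos)->member.prev, typeof(*(pos)), member)

Besides being shorter, typeof(*(pos)) removes the chance of naming the
wrong containing type, which the open-coded list_entry(page->lru.next,
struct page, lru) calls left to the programmer.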
@@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
-                                 gfp_t gfp_mask)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
 {
        struct swap_info_struct *si, *next;
-       if (!(gfp_mask & __GFP_IO) || !memcg)
+       int nid = page_to_nid(page);
+
+       if (!(gfp_mask & __GFP_IO))
                return;
 
        if (!blk_cgroup_congested())
@@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
                return;
 
        spin_lock(&swap_avail_lock);
-       plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
-                                 avail_lists[node]) {
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+                                 avail_lists[nid]) {
                if (si->bdev) {
-                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
-                                               true);
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
                        break;
                }
        }