diff --git a/mm/swapfile.c b/mm/swapfile.c
index 996afa8..1e07d1c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -99,11 +100,10 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
 static struct swap_info_struct *swap_type_to_swap_info(int type)
 {
-       if (type >= READ_ONCE(nr_swapfiles))
+       if (type >= MAX_SWAPFILES)
                return NULL;
 
-       smp_rmb();      /* Pairs with smp_wmb in alloc_swap_info. */
-       return READ_ONCE(swap_info[type]);
+       return READ_ONCE(swap_info[type]); /* rcu_dereference() */
 }
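
With the swap_info[] slot published by smp_store_release() (see the alloc_swap_info() hunk further down), the lookup above only needs the bound check against MAX_SWAPFILES plus a dependency-ordered load: READ_ONCE() of the pointer behaves like rcu_dereference(), so the old smp_rmb()/nr_swapfiles pairing can go. A minimal sketch of that publish/lookup pairing; struct foo, foo_slots[], publish_foo() and lookup_foo() are made-up names for illustration, not kernel code:

struct foo {
	unsigned long max;
	/* ... */
};

#define MAX_FOO	32
static struct foo *foo_slots[MAX_FOO];

/* Writer side: initialise every field, then publish the pointer. */
static void publish_foo(int type, struct foo *p)
{
	p->max = 1024;				/* all initialisation first */
	smp_store_release(&foo_slots[type], p);	/* like rcu_assign_pointer() */
}

/* Reader side: a bound check plus a dependent load is enough. */
static struct foo *lookup_foo(int type)
{
	if (type >= MAX_FOO)
		return NULL;

	return READ_ONCE(foo_slots[type]);	/* like rcu_dereference() */
}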
 
 static inline unsigned char swap_count(unsigned char ent)
@@ -452,10 +452,10 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
                unsigned int idx)
 {
        /*
-        * If scan_swap_map() can't find a free cluster, it will check
+        * If scan_swap_map_slots() can't find a free cluster, it will check
         * si->swap_map directly. To make sure the discarding cluster isn't
-        * taken by scan_swap_map(), mark the swap entries bad (occupied). It
-        * will be cleared after discard
+        * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
+        * It will be cleared after discard
         */
        memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                        SWAP_MAP_BAD, SWAPFILE_CLUSTER);
@@ -511,6 +511,14 @@ static void swap_discard_work(struct work_struct *work)
        spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+       struct swap_info_struct *si;
+
+       si = container_of(ref, struct swap_info_struct, users);
+       complete(&si->comp);
+}
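
swap_users_ref_free() is the release callback of the new si->users percpu_ref: it runs once the reference count reaches zero after the ref has been killed, and it wakes whoever is sleeping in wait_for_completion(&si->comp) (the swapoff path further down). This is the stock percpu_ref plus completion teardown pattern; a hedged, self-contained sketch using a hypothetical struct bar with bar_users_free()/bar_shutdown() as illustrative names:

struct bar {
	struct percpu_ref users;
	struct completion comp;
};

/* Called when the last reference is dropped after percpu_ref_kill(). */
static void bar_users_free(struct percpu_ref *ref)
{
	struct bar *b = container_of(ref, struct bar, users);

	complete(&b->comp);
}

/* Teardown: refuse new references, then wait for old ones to drain. */
static void bar_shutdown(struct bar *b)
{
	percpu_ref_kill(&b->users);
	wait_for_completion(&b->comp);
	percpu_ref_exit(&b->users);
}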
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
        struct swap_cluster_info *ci = si->cluster_info;
@@ -580,7 +588,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 }
 
 /*
- * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
  * cluster list. Avoiding such abuse to avoid list corruption.
  */
 static bool
@@ -1028,21 +1036,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
        swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
 
-static unsigned long scan_swap_map(struct swap_info_struct *si,
-                                  unsigned char usage)
-{
-       swp_entry_t entry;
-       int n_ret;
-
-       n_ret = scan_swap_map_slots(si, usage, 1, &entry);
-
-       if (n_ret)
-               return swp_offset(entry);
-       else
-               return 0;
-
-}
-
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 {
        unsigned long size = swap_entry_size(entry_size);
@@ -1105,14 +1098,14 @@ start_over:
 nextsi:
                /*
                 * if we got here, it's likely that si was almost full before,
-                * and since scan_swap_map() can drop the si->lock, multiple
-                * callers probably all tried to get a page from the same si
-                * and it filled up before we could get one; or, the si filled
-                * up between us dropping swap_avail_lock and taking si->lock.
-                * Since we dropped the swap_avail_lock, the swap_avail_head
-                * list may have been modified; so if next is still in the
-                * swap_avail_head list then try it, otherwise start over
-                * if we have not gotten any slots.
+                * and since scan_swap_map_slots() can drop the si->lock,
+                * multiple callers probably all tried to get a page from the
+                * same si and it filled up before we could get one; or, the si
+                * filled up between us dropping swap_avail_lock and taking
+                * si->lock. Since we dropped the swap_avail_lock, the
+                * swap_avail_head list may have been modified; so if next is
+                * still in the swap_avail_head list then try it, otherwise
+                * start over if we have not gotten any slots.
                 */
                if (plist_node_empty(&next->avail_lists[node]))
                        goto start_over;
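
The reworded comment describes the retry rule needed because swap_avail_lock is dropped while an individual device is tried: before moving on to 'next', re-check that it is still on the per-node avail plist, otherwise rescan from the top. Roughly the shape of the loop, condensed from get_swap_pages() for orientation rather than as a drop-in replacement (the real code also only restarts when no slots have been allocated yet):

	spin_lock(&swap_avail_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		/* rotate si behind same-priority entries, then drop the lock */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);

		/* ... scan_swap_map_slots(si, ...) runs under si->lock only ... */

		spin_lock(&swap_avail_lock);
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;	/* list changed while unlocked */
	}
	spin_unlock(&swap_avail_lock);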
@@ -1128,30 +1121,6 @@ noswap:
        return n_ret;
 }
 
-/* The only caller of this function is now suspend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
-       struct swap_info_struct *si = swap_type_to_swap_info(type);
-       pgoff_t offset;
-
-       if (!si)
-               goto fail;
-
-       spin_lock(&si->lock);
-       if (si->flags & SWP_WRITEOK) {
-               /* This is called for allocating swap entry, not cache */
-               offset = scan_swap_map(si, 1);
-               if (offset) {
-                       atomic_long_dec(&nr_swap_pages);
-                       spin_unlock(&si->lock);
-                       return swp_entry(type, offset);
-               }
-       }
-       spin_unlock(&si->lock);
-fail:
-       return (swp_entry_t) {0};
-}
-
 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
        struct swap_info_struct *p;
@@ -1270,18 +1239,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1                              CPU2
  *   do_swap_page()
@@ -1309,21 +1272,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
        si = swp_swap_info(entry);
        if (!si)
                goto bad_nofile;
-
-       rcu_read_lock();
-       if (data_race(!(si->flags & SWP_VALID)))
-               goto unlock_out;
+       if (!percpu_ref_tryget_live(&si->users))
+               goto out;
+       /*
+        * Guarantee the si->users are checked before accessing other
+        * fields of swap_info_struct.
+        *
+        * Paired with the spin_unlock() after setup_swap_info() in
+        * enable_swap_info().
+        */
+       smp_rmb();
        offset = swp_offset(entry);
        if (offset >= si->max)
-               goto unlock_out;
+               goto put_out;
 
        return si;
 bad_nofile:
        pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
        return NULL;
-unlock_out:
-       rcu_read_unlock();
+put_out:
+       percpu_ref_put(&si->users);
        return NULL;
 }
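
For callers the contract is unchanged: a non-NULL return pins the device (and with it si->swap_map, si->cluster_info and friends) until put_swap_device(). A minimal, hedged caller sketch; probe_swap_cache() is a made-up wrapper, and the lookup_swap_cache() call merely stands in for whatever the caller does while the device is pinned:

/* Illustrative: probe the swap cache for 'entry' while holding off swapoff. */
static struct page *probe_swap_cache(swp_entry_t entry,
				     struct vm_area_struct *vma,
				     unsigned long address)
{
	struct swap_info_struct *si;
	struct page *page = NULL;

	si = get_swap_device(entry);
	if (si) {
		/* swapoff cannot complete until put_swap_device() below */
		page = lookup_swap_cache(entry, vma, address);
		put_swap_device(si);
	}

	return page;
}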
 
@@ -1803,6 +1772,24 @@ int free_swap_and_cache(swp_entry_t entry)
 }
 
 #ifdef CONFIG_HIBERNATION
+
+swp_entry_t get_swap_page_of_type(int type)
+{
+       struct swap_info_struct *si = swap_type_to_swap_info(type);
+       swp_entry_t entry = {0};
+
+       if (!si)
+               goto fail;
+
+       /* This is called for allocating swap entry, not cache */
+       spin_lock(&si->lock);
+       if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+               atomic_long_dec(&nr_swap_pages);
+       spin_unlock(&si->lock);
+fail:
+       return entry;
+}
+
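get_swap_page_of_type() now lives under CONFIG_HIBERNATION because suspend-to-disk is its only user: it allocates single slots on one chosen device for the image. A hedged sketch of such a caller; alloc_image_slot() is an illustrative name only, the real consumer lives in kernel/power/swap.c:

/* Illustrative: grab one slot on swap device 'swap_type' for an image page. */
static unsigned long alloc_image_slot(int swap_type)
{
	swp_entry_t entry = get_swap_page_of_type(swap_type);

	if (!entry.val)
		return 0;	/* no such device, device full, or !SWP_WRITEOK */

	return swp_offset(entry);
}
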
 /*
  * Find the swap type that corresponds to given device (if any).
  *
@@ -2466,7 +2453,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-       p->flags |= SWP_WRITEOK | SWP_VALID;
+       p->flags |= SWP_WRITEOK;
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;
 
@@ -2497,10 +2484,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        /*
-        * Guarantee swap_map, cluster_info, etc. fields are valid
-        * between get/put_swap_device() if SWP_VALID bit is set
+        * Finished initializing swap device, now it's safe to reference it.
         */
-       synchronize_rcu();
+       percpu_ref_resurrect(&p->users);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        _enable_swap_info(p);
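
The resurrect replaces the synchronize_rcu() here: everything setup_swap_info() wrote is ordered before the unlock (a release), and percpu_ref_resurrect() only happens after that, so a reader whose percpu_ref_tryget_live() succeeds and then issues smp_rmb() (see get_swap_device() above) is guaranteed to see the initialized fields. Sketched in the two-column style the existing comment already uses:

   enable_swap_info()                       get_swap_device()
   ------------------                       -----------------
   setup_swap_info(p, ...)
     writes si->max, si->swap_map, ...
   spin_unlock(&p->lock)     /* release */
   percpu_ref_resurrect(&p->users)
                                            percpu_ref_tryget_live(&si->users) /* succeeds */
                                            smp_rmb()
                                            reads si->max, si->swap_map /* sees init values */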
@@ -2616,16 +2602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
        reenable_swap_slots_cache_unlock();
 
-       spin_lock(&swap_lock);
-       spin_lock(&p->lock);
-       p->flags &= ~SWP_VALID;         /* mark swap device as invalid */
-       spin_unlock(&p->lock);
-       spin_unlock(&swap_lock);
        /*
-        * wait for swap operations protected by get/put_swap_device()
-        * to complete
+        * Wait for swap operations protected by get/put_swap_device()
+        * to complete.
+        *
+        * We need synchronize_rcu() here to protect access to
+        * the swap cache data structure.
         */
+       percpu_ref_kill(&p->users);
        synchronize_rcu();
+       wait_for_completion(&p->comp);
 
        flush_work(&p->discard_work);
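
Teardown is now a three-step drain rather than a flag flip plus synchronize_rcu(): percpu_ref_kill() makes every later percpu_ref_tryget_live() in get_swap_device() fail, the retained synchronize_rcu() waits out lockless RCU readers of the swap cache, and wait_for_completion() sleeps until the last earlier reference is dropped and swap_users_ref_free() fires. Sketched in the same CPU-vs-CPU style:

   swapoff()                                reader
   ---------                                ------
   percpu_ref_kill(&p->users)
                                            get_swap_device()
                                              percpu_ref_tryget_live() -> false
                                              return NULL          /* bails out */
   synchronize_rcu()
   wait_for_completion(&p->comp)
     ... sleeps until an earlier, still pinned
         reader calls put_swap_device(), the ref
         hits zero and swap_users_ref_free()
         runs complete(&p->comp) ...
   flush_work(&p->discard_work)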
 
@@ -2641,7 +2627,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_lock(&p->lock);
        drain_mmlist();
 
-       /* wait for anyone still in scan_swap_map */
+       /* wait for anyone still in scan_swap_map_slots */
        p->highest_bit = 0;             /* cuts scans short */
        while (p->flags >= SWP_SCANNING) {
                spin_unlock(&p->lock);
@@ -2857,6 +2843,12 @@ static struct swap_info_struct *alloc_swap_info(void)
        if (!p)
                return ERR_PTR(-ENOMEM);
 
+       if (percpu_ref_init(&p->users, swap_users_ref_free,
+                           PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+               kvfree(p);
+               return ERR_PTR(-ENOMEM);
+       }
+
        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                if (!(swap_info[type]->flags & SWP_USED))
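
Two things are worth noting in the hunk above: the ref starts out dead (PERCPU_REF_INIT_DEAD), so get_swap_device() cannot pin a device that is still being set up (enable_swap_info() later flips it live with percpu_ref_resurrect()), and percpu_ref_init() allocates per-CPU counters, so every later error path needs a matching percpu_ref_exit() before kvfree(), as the next hunk adds for the MAX_SWAPFILES case. A hedged sketch of that allocate/unwind shape, reusing the hypothetical struct bar and bar_users_free() from the earlier sketch:

static struct bar *bar_alloc(void)
{
	struct bar *b = kvzalloc(sizeof(*b), GFP_KERNEL);

	if (!b)
		return ERR_PTR(-ENOMEM);

	/* start dead: nobody may take a reference until we resurrect it */
	if (percpu_ref_init(&b->users, bar_users_free,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
		kvfree(b);
		return ERR_PTR(-ENOMEM);
	}
	init_completion(&b->comp);

	return b;
}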
@@ -2864,19 +2856,18 @@ static struct swap_info_struct *alloc_swap_info(void)
        }
        if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
+               percpu_ref_exit(&p->users);
                kvfree(p);
                return ERR_PTR(-EPERM);
        }
        if (type >= nr_swapfiles) {
                p->type = type;
-               WRITE_ONCE(swap_info[type], p);
                /*
-                * Write swap_info[type] before nr_swapfiles, in case a
-                * racing procfs swap_start() or swap_next() is reading them.
-                * (We never shrink nr_swapfiles, we never free this entry.)
+                * Publish the swap_info_struct after initializing it.
+                * Note that kvzalloc() above zeroes all its fields.
                 */
-               smp_wmb();
-               WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+               smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
+               nr_swapfiles++;
        } else {
                defer = p;
                p = swap_info[type];
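
Publishing a brand-new entry is simpler as well: the pointer store itself carries the release (annotated above as the moral equivalent of rcu_assign_pointer()), and because swap_type_to_swap_info() now bound-checks against MAX_SWAPFILES instead of nr_swapfiles, racing readers such as the /proc/swaps iterators need no ordering against the nr_swapfiles++ that follows; an unpublished slot simply reads back as NULL. A hedged sketch of such a reader-side walk:

/* Illustrative: walk every possible device; slots never published
 * (or never used) just read back as NULL and are skipped. */
int type;

for (type = 0; type < MAX_SWAPFILES; type++) {
	struct swap_info_struct *si = swap_type_to_swap_info(type);

	if (!si)
		continue;

	/* ... inspect or report si ... */
}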
@@ -2891,9 +2882,13 @@ static struct swap_info_struct *alloc_swap_info(void)
                plist_node_init(&p->avail_lists[i], 0);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
-       kvfree(defer);
+       if (defer) {
+               percpu_ref_exit(&defer->users);
+               kvfree(defer);
+       }
        spin_lock_init(&p->lock);
        spin_lock_init(&p->cont_lock);
+       init_completion(&p->comp);
 
        return p;
 }
@@ -2972,7 +2967,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
                return 0;
        }
 
-       /* swap partition endianess hack... */
+       /* swap partition endianness hack... */
        if (swab32(swap_header->info.version) == 1) {
                swab32s(&swap_header->info.version);
                swab32s(&swap_header->info.last_page);
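
The swap header is written in the byte order of the machine that ran mkswap, so a version field that only reads as 1 after swab32() means the swap area was built on an opposite-endian machine and the 32-bit header fields must be byte-swapped in place. A small worked example of what swab32() does to the version word:

/* swab32() reverses the four bytes of a 32-bit value:
 *
 *     0x00000001  --swab32()-->  0x01000000
 *
 * so a header whose 'version' becomes 1 only after swab32() was
 * produced with the opposite endianness and needs swab32s() fixups.
 */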