powerpc/pseries/cmm: Switch to balloon_page_alloc()
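
For context, the heart of the conversion below is the generic balloon_compaction API: pages pulled out of the buddy allocator are handed to balloon_page_enqueue() for tracking, and later returned through balloon_page_dequeue(). The following is a condensed sketch of that inflate/deflate pattern as the diff adopts it; the helper names inflate_one()/deflate_one() are purely illustrative, and the hypervisor notification calls are elided.

	#include <linux/balloon_compaction.h>
	#include <linux/mm.h>

	static struct balloon_dev_info b_dev_info;	/* set up once with balloon_devinfo_init() */

	/* Inflate: take one page away from the OS and track it on the balloon list. */
	static int inflate_one(void)
	{
		struct page *page = balloon_page_alloc();

		if (!page)
			return -ENOMEM;
		/* ...tell the hypervisor the page is loaned (plpar_page_set_loaned())... */
		balloon_page_enqueue(&b_dev_info, page);
		adjust_managed_page_count(page, -1);
		return 0;
	}

	/* Deflate: pull one tracked page off the balloon list and give it back. */
	static int deflate_one(void)
	{
		struct page *page = balloon_page_dequeue(&b_dev_info);

		if (!page)
			return -EBUSY;
		/* ...tell the hypervisor the page is active again (plpar_page_set_active())... */
		adjust_managed_page_count(page, 1);
		__free_page(page);
		return 0;
	}

With CONFIG_BALLOON_COMPACTION, the b_dev_info.migratepage callback (cmm_migratepage() in the diff) additionally allows the core mm to migrate an inflated page, so memory blocks backing balloon pages can still be offlined without the driver walking its own page list.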
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index b33251d..86eb845 100644
 #include <linux/stringify.h>
 #include <linux/swap.h>
 #include <linux/device.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/magic.h>
+#include <linux/balloon_compaction.h>
 #include <asm/firmware.h>
 #include <asm/hvcall.h>
 #include <asm/mmu.h>
 #define CMM_MIN_MEM_MB         256
 #define KB2PAGES(_p)           ((_p)>>(PAGE_SHIFT-10))
 #define PAGES2KB(_p)           ((_p)<<(PAGE_SHIFT-10))
-/*
- * The priority level tries to ensure that this notifier is called as
- * late as possible to reduce thrashing in the shared memory pool.
- */
+
 #define CMM_MEM_HOTPLUG_PRI    1
-#define CMM_MEM_ISOLATE_PRI    15
 
 static unsigned int delay = CMM_DEFAULT_DELAY;
 static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
@@ -75,30 +75,21 @@ module_param_named(debug, cmm_debug, uint, 0644);
 MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
                 "[Default=" __stringify(CMM_DEBUG) "]");
 
-#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
-
 #define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
 
-struct cmm_page_array {
-       struct cmm_page_array *next;
-       unsigned long index;
-       unsigned long page[CMM_NR_PAGES];
-};
-
-static unsigned long loaned_pages;
+static atomic_long_t loaned_pages;
 static unsigned long loaned_pages_target;
 static unsigned long oom_freed_pages;
 
-static struct cmm_page_array *cmm_page_list;
-static DEFINE_SPINLOCK(cmm_lock);
-
 static DEFINE_MUTEX(hotplug_mutex);
 static int hotplug_occurred; /* protected by the hotplug mutex */
 
 static struct task_struct *cmm_thread_ptr;
+static struct balloon_dev_info b_dev_info;
 
-static long plpar_page_set_loaned(unsigned long vpa)
+static long plpar_page_set_loaned(struct page *page)
 {
+       const unsigned long vpa = page_to_phys(page);
        unsigned long cmo_page_sz = cmo_get_page_size();
        long rc = 0;
        int i;
@@ -113,8 +104,9 @@ static long plpar_page_set_loaned(unsigned long vpa)
        return rc;
 }
 
-static long plpar_page_set_active(unsigned long vpa)
+static long plpar_page_set_active(struct page *page)
 {
+       const unsigned long vpa = page_to_phys(page);
        unsigned long cmo_page_sz = cmo_get_page_size();
        long rc = 0;
        int i;
@@ -138,8 +130,7 @@ static long plpar_page_set_active(unsigned long vpa)
  **/
 static long cmm_alloc_pages(long nr)
 {
-       struct cmm_page_array *pa, *npa;
-       unsigned long addr;
+       struct page *page;
        long rc;
 
        cmm_dbg("Begin request for %ld pages\n", nr);
@@ -156,46 +147,19 @@ static long cmm_alloc_pages(long nr)
                        break;
                }
 
-               addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
-                                      __GFP_NORETRY | __GFP_NOMEMALLOC);
-               if (!addr)
+               page = balloon_page_alloc();
+               if (!page)
                        break;
-               spin_lock(&cmm_lock);
-               pa = cmm_page_list;
-               if (!pa || pa->index >= CMM_NR_PAGES) {
-                       /* Need a new page for the page list. */
-                       spin_unlock(&cmm_lock);
-                       npa = (struct cmm_page_array *)__get_free_page(
-                                       GFP_NOIO | __GFP_NOWARN |
-                                       __GFP_NORETRY | __GFP_NOMEMALLOC);
-                       if (!npa) {
-                               pr_info("%s: Can not allocate new page list\n", __func__);
-                               free_page(addr);
-                               break;
-                       }
-                       spin_lock(&cmm_lock);
-                       pa = cmm_page_list;
-
-                       if (!pa || pa->index >= CMM_NR_PAGES) {
-                               npa->next = pa;
-                               npa->index = 0;
-                               pa = npa;
-                               cmm_page_list = pa;
-                       } else
-                               free_page((unsigned long) npa);
-               }
-
-               if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+               rc = plpar_page_set_loaned(page);
+               if (rc) {
                        pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc);
-                       spin_unlock(&cmm_lock);
-                       free_page(addr);
+                       __free_page(page);
                        break;
                }
 
-               pa->page[pa->index++] = addr;
-               loaned_pages++;
-               totalram_pages_dec();
-               spin_unlock(&cmm_lock);
+               balloon_page_enqueue(&b_dev_info, page);
+               atomic_long_inc(&loaned_pages);
+               adjust_managed_page_count(page, -1);
                nr--;
        }
 
@@ -212,30 +176,19 @@ static long cmm_alloc_pages(long nr)
  **/
 static long cmm_free_pages(long nr)
 {
-       struct cmm_page_array *pa;
-       unsigned long addr;
+       struct page *page;
 
        cmm_dbg("Begin free of %ld pages.\n", nr);
-       spin_lock(&cmm_lock);
-       pa = cmm_page_list;
        while (nr) {
-               if (!pa || pa->index <= 0)
+               page = balloon_page_dequeue(&b_dev_info);
+               if (!page)
                        break;
-               addr = pa->page[--pa->index];
-
-               if (pa->index == 0) {
-                       pa = pa->next;
-                       free_page((unsigned long) cmm_page_list);
-                       cmm_page_list = pa;
-               }
-
-               plpar_page_set_active(__pa(addr));
-               free_page(addr);
-               loaned_pages--;
+               plpar_page_set_active(page);
+               adjust_managed_page_count(page, 1);
+               __free_page(page);
+               atomic_long_dec(&loaned_pages);
                nr--;
-               totalram_pages_inc();
        }
-       spin_unlock(&cmm_lock);
        cmm_dbg("End request with %ld pages unfulfilled\n", nr);
        return nr;
 }
@@ -257,7 +210,7 @@ static int cmm_oom_notify(struct notifier_block *self,
 
        cmm_dbg("OOM processing started\n");
        nr = cmm_free_pages(nr);
-       loaned_pages_target = loaned_pages;
+       loaned_pages_target = atomic_long_read(&loaned_pages);
        *freed += KB2PAGES(oom_kb) - nr;
        oom_freed_pages += KB2PAGES(oom_kb) - nr;
        cmm_dbg("OOM processing complete\n");
@@ -274,10 +227,11 @@ static int cmm_oom_notify(struct notifier_block *self,
  **/
 static void cmm_get_mpp(void)
 {
+       const long __loaned_pages = atomic_long_read(&loaned_pages);
+       const long total_pages = totalram_pages() + __loaned_pages;
        int rc;
        struct hvcall_mpp_data mpp_data;
        signed long active_pages_target, page_loan_request, target;
-       signed long total_pages = totalram_pages() + loaned_pages;
        signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
 
        rc = h_get_mpp(&mpp_data);
@@ -286,7 +240,7 @@ static void cmm_get_mpp(void)
                return;
 
        page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
-       target = page_loan_request + (signed long)loaned_pages;
+       target = page_loan_request + __loaned_pages;
 
        if (target < 0 || total_pages < min_mem_pages)
                target = 0;
@@ -307,7 +261,7 @@ static void cmm_get_mpp(void)
        loaned_pages_target = target;
 
        cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
-               page_loan_request, loaned_pages, loaned_pages_target,
+               page_loan_request, __loaned_pages, loaned_pages_target,
                oom_freed_pages, totalram_pages());
 }
 
@@ -325,6 +279,7 @@ static struct notifier_block cmm_oom_nb = {
 static int cmm_thread(void *dummy)
 {
        unsigned long timeleft;
+       long __loaned_pages;
 
        while (1) {
                timeleft = msleep_interruptible(delay * 1000);
@@ -355,11 +310,12 @@ static int cmm_thread(void *dummy)
 
                cmm_get_mpp();
 
-               if (loaned_pages_target > loaned_pages) {
-                       if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
-                               loaned_pages_target = loaned_pages;
-               } else if (loaned_pages_target < loaned_pages)
-                       cmm_free_pages(loaned_pages - loaned_pages_target);
+               __loaned_pages = atomic_long_read(&loaned_pages);
+               if (loaned_pages_target > __loaned_pages) {
+                       if (cmm_alloc_pages(loaned_pages_target - __loaned_pages))
+                               loaned_pages_target = __loaned_pages;
+               } else if (loaned_pages_target < __loaned_pages)
+                       cmm_free_pages(__loaned_pages - loaned_pages_target);
        }
        return 0;
 }
@@ -373,7 +329,7 @@ static int cmm_thread(void *dummy)
        }                                                       \
        static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
-CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(atomic_long_read(&loaned_pages)));
 CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
 
 static ssize_t show_oom_pages(struct device *dev,
@@ -411,6 +367,10 @@ static struct bus_type cmm_subsys = {
        .dev_name = "cmm",
 };
 
+static void cmm_release_device(struct device *dev)
+{
+}
+
 /**
  * cmm_sysfs_register - Register with sysfs
  *
@@ -426,6 +386,7 @@ static int cmm_sysfs_register(struct device *dev)
 
        dev->id = 0;
        dev->bus = &cmm_subsys;
+       dev->release = cmm_release_device;
 
        if ((rc = device_register(dev)))
                goto subsys_unregister;
@@ -471,7 +432,7 @@ static int cmm_reboot_notifier(struct notifier_block *nb,
                if (cmm_thread_ptr)
                        kthread_stop(cmm_thread_ptr);
                cmm_thread_ptr = NULL;
-               cmm_free_pages(loaned_pages);
+               cmm_free_pages(atomic_long_read(&loaned_pages));
        }
        return NOTIFY_DONE;
 }
@@ -480,142 +441,6 @@ static struct notifier_block cmm_reboot_nb = {
        .notifier_call = cmm_reboot_notifier,
 };
 
-/**
- * cmm_count_pages - Count the number of pages loaned in a particular range.
- *
- * @arg: memory_isolate_notify structure with address range and count
- *
- * Return value:
- *      0 on success
- **/
-static unsigned long cmm_count_pages(void *arg)
-{
-       struct memory_isolate_notify *marg = arg;
-       struct cmm_page_array *pa;
-       unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
-       unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
-       unsigned long idx;
-
-       spin_lock(&cmm_lock);
-       pa = cmm_page_list;
-       while (pa) {
-               if ((unsigned long)pa >= start && (unsigned long)pa < end)
-                       marg->pages_found++;
-               for (idx = 0; idx < pa->index; idx++)
-                       if (pa->page[idx] >= start && pa->page[idx] < end)
-                               marg->pages_found++;
-               pa = pa->next;
-       }
-       spin_unlock(&cmm_lock);
-       return 0;
-}
-
-/**
- * cmm_memory_isolate_cb - Handle memory isolation notifier calls
- * @self:      notifier block struct
- * @action:    action to take
- * @arg:       struct memory_isolate_notify data for handler
- *
- * Return value:
- *     NOTIFY_OK or notifier error based on subfunction return value
- **/
-static int cmm_memory_isolate_cb(struct notifier_block *self,
-                                unsigned long action, void *arg)
-{
-       int ret = 0;
-
-       if (action == MEM_ISOLATE_COUNT)
-               ret = cmm_count_pages(arg);
-
-       return notifier_from_errno(ret);
-}
-
-static struct notifier_block cmm_mem_isolate_nb = {
-       .notifier_call = cmm_memory_isolate_cb,
-       .priority = CMM_MEM_ISOLATE_PRI
-};
-
-/**
- * cmm_mem_going_offline - Unloan pages where memory is to be removed
- * @arg: memory_notify structure with page range to be offlined
- *
- * Return value:
- *     0 on success
- **/
-static int cmm_mem_going_offline(void *arg)
-{
-       struct memory_notify *marg = arg;
-       unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
-       unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
-       struct cmm_page_array *pa_curr, *pa_last, *npa;
-       unsigned long idx;
-       unsigned long freed = 0;
-
-       cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
-                       start_page, marg->nr_pages);
-       spin_lock(&cmm_lock);
-
-       /* Search the page list for pages in the range to be offlined */
-       pa_last = pa_curr = cmm_page_list;
-       while (pa_curr) {
-               for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
-                       if ((pa_curr->page[idx] < start_page) ||
-                           (pa_curr->page[idx] >= end_page))
-                               continue;
-
-                       plpar_page_set_active(__pa(pa_curr->page[idx]));
-                       free_page(pa_curr->page[idx]);
-                       freed++;
-                       loaned_pages--;
-                       totalram_pages_inc();
-                       pa_curr->page[idx] = pa_last->page[--pa_last->index];
-                       if (pa_last->index == 0) {
-                               if (pa_curr == pa_last)
-                                       pa_curr = pa_last->next;
-                               pa_last = pa_last->next;
-                               free_page((unsigned long)cmm_page_list);
-                               cmm_page_list = pa_last;
-                       }
-               }
-               pa_curr = pa_curr->next;
-       }
-
-       /* Search for page list structures in the range to be offlined */
-       pa_last = NULL;
-       pa_curr = cmm_page_list;
-       while (pa_curr) {
-               if (((unsigned long)pa_curr >= start_page) &&
-                               ((unsigned long)pa_curr < end_page)) {
-                       npa = (struct cmm_page_array *)__get_free_page(
-                                       GFP_NOIO | __GFP_NOWARN |
-                                       __GFP_NORETRY | __GFP_NOMEMALLOC);
-                       if (!npa) {
-                               spin_unlock(&cmm_lock);
-                               cmm_dbg("Failed to allocate memory for list "
-                                               "management. Memory hotplug "
-                                               "failed.\n");
-                               return -ENOMEM;
-                       }
-                       memcpy(npa, pa_curr, PAGE_SIZE);
-                       if (pa_curr == cmm_page_list)
-                               cmm_page_list = npa;
-                       if (pa_last)
-                               pa_last->next = npa;
-                       free_page((unsigned long) pa_curr);
-                       freed++;
-                       pa_curr = npa;
-               }
-
-               pa_last = pa_curr;
-               pa_curr = pa_curr->next;
-       }
-
-       spin_unlock(&cmm_lock);
-       cmm_dbg("Released %ld pages in the search range.\n", freed);
-
-       return 0;
-}
-
 /**
  * cmm_memory_cb - Handle memory hotplug notifier calls
  * @self:      notifier block struct
@@ -635,7 +460,6 @@ static int cmm_memory_cb(struct notifier_block *self,
        case MEM_GOING_OFFLINE:
                mutex_lock(&hotplug_mutex);
                hotplug_occurred = 1;
-               ret = cmm_mem_going_offline(arg);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_OFFLINE:
@@ -656,6 +480,106 @@ static struct notifier_block cmm_mem_nb = {
        .priority = CMM_MEM_HOTPLUG_PRI
 };
 
+#ifdef CONFIG_BALLOON_COMPACTION
+static struct vfsmount *balloon_mnt;
+
+static int cmm_init_fs_context(struct fs_context *fc)
+{
+       return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type balloon_fs = {
+       .name = "ppc-cmm",
+       .init_fs_context = cmm_init_fs_context,
+       .kill_sb = kill_anon_super,
+};
+
+static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
+                          struct page *newpage, struct page *page,
+                          enum migrate_mode mode)
+{
+       unsigned long flags;
+
+       /*
+        * loan/"inflate" the newpage first.
+        *
+        * We might race against the cmm_thread who might discover after our
+        * loan request that another page is to be unloaned. However, once
+        * the cmm_thread runs again later, this error will automatically
+        * be corrected.
+        */
+       if (plpar_page_set_loaned(newpage)) {
+               /* Unlikely, but possible. Tell the caller not to retry now. */
+               pr_err_ratelimited("%s: Cannot set page to loaned.", __func__);
+               return -EBUSY;
+       }
+
+       /* balloon page list reference */
+       get_page(newpage);
+
+       spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+       balloon_page_insert(b_dev_info, newpage);
+       balloon_page_delete(page);
+       b_dev_info->isolated_pages--;
+       spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+       /*
+        * activate/"deflate" the old page. We ignore any errors just like the
+        * other callers.
+        */
+       plpar_page_set_active(page);
+
+       /* balloon page list reference */
+       put_page(page);
+
+       return MIGRATEPAGE_SUCCESS;
+}
+
+static int cmm_balloon_compaction_init(void)
+{
+       int rc;
+
+       balloon_devinfo_init(&b_dev_info);
+       b_dev_info.migratepage = cmm_migratepage;
+
+       balloon_mnt = kern_mount(&balloon_fs);
+       if (IS_ERR(balloon_mnt)) {
+               rc = PTR_ERR(balloon_mnt);
+               balloon_mnt = NULL;
+               return rc;
+       }
+
+       b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+       if (IS_ERR(b_dev_info.inode)) {
+               rc = PTR_ERR(b_dev_info.inode);
+               b_dev_info.inode = NULL;
+               kern_unmount(balloon_mnt);
+               balloon_mnt = NULL;
+               return rc;
+       }
+
+       b_dev_info.inode->i_mapping->a_ops = &balloon_aops;
+       return 0;
+}
+
+static void cmm_balloon_compaction_deinit(void)
+{
+       if (b_dev_info.inode)
+               iput(b_dev_info.inode);
+       b_dev_info.inode = NULL;
+       kern_unmount(balloon_mnt);
+       balloon_mnt = NULL;
+}
+#else /* CONFIG_BALLOON_COMPACTION */
+static int cmm_balloon_compaction_init(void)
+{
+       return 0;
+}
+
+static void cmm_balloon_compaction_deinit(void)
+{
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
+
 /**
  * cmm_init - Module initialization
  *
@@ -664,26 +588,31 @@ static struct notifier_block cmm_mem_nb = {
  **/
 static int cmm_init(void)
 {
-       int rc = -ENOMEM;
+       int rc;
 
        if (!firmware_has_feature(FW_FEATURE_CMO))
                return -EOPNOTSUPP;
 
-       if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+       rc = cmm_balloon_compaction_init();
+       if (rc)
                return rc;
 
+       rc = register_oom_notifier(&cmm_oom_nb);
+       if (rc < 0)
+               goto out_balloon_compaction;
+
        if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
                goto out_oom_notifier;
 
        if ((rc = cmm_sysfs_register(&cmm_dev)))
                goto out_reboot_notifier;
 
-       if (register_memory_notifier(&cmm_mem_nb) ||
-           register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+       rc = register_memory_notifier(&cmm_mem_nb);
+       if (rc)
                goto out_unregister_notifier;
 
        if (cmm_disabled)
-               return rc;
+               return 0;
 
        cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
        if (IS_ERR(cmm_thread_ptr)) {
@@ -691,16 +620,16 @@ static int cmm_init(void)
                goto out_unregister_notifier;
        }
 
-       return rc;
-
+       return 0;
 out_unregister_notifier:
        unregister_memory_notifier(&cmm_mem_nb);
-       unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
        cmm_unregister_sysfs(&cmm_dev);
 out_reboot_notifier:
        unregister_reboot_notifier(&cmm_reboot_nb);
 out_oom_notifier:
        unregister_oom_notifier(&cmm_oom_nb);
+out_balloon_compaction:
+       cmm_balloon_compaction_deinit();
        return rc;
 }
 
@@ -717,9 +646,9 @@ static void cmm_exit(void)
        unregister_oom_notifier(&cmm_oom_nb);
        unregister_reboot_notifier(&cmm_reboot_nb);
        unregister_memory_notifier(&cmm_mem_nb);
-       unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
-       cmm_free_pages(loaned_pages);
+       cmm_free_pages(atomic_long_read(&loaned_pages));
        cmm_unregister_sysfs(&cmm_dev);
+       cmm_balloon_compaction_deinit();
 }
 
 /**
@@ -739,7 +668,7 @@ static int cmm_set_disable(const char *val, const struct kernel_param *kp)
                if (cmm_thread_ptr)
                        kthread_stop(cmm_thread_ptr);
                cmm_thread_ptr = NULL;
-               cmm_free_pages(loaned_pages);
+               cmm_free_pages(atomic_long_read(&loaned_pages));
        } else if (!disable && cmm_disabled) {
                cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
                if (IS_ERR(cmm_thread_ptr))