drivers/base/memory.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Memory subsystem support
   4  *
   5  * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
   6  *            Dave Hansen <haveblue@us.ibm.com>
   7  *
   8  * This file provides the necessary infrastructure to represent
   9  * a SPARSEMEM-memory-model system's physical memory in /sysfs.
  10  * All arch-independent code that assumes MEMORY_HOTPLUG requires
  11  * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
  12  */
  13
  14 #include <linux/module.h>
  15 #include <linux/init.h>
  16 #include <linux/topology.h>
  17 #include <linux/capability.h>
  18 #include <linux/device.h>
  19 #include <linux/memory.h>
  20 #include <linux/memory_hotplug.h>
  21 #include <linux/mm.h>
  22 #include <linux/stat.h>
  23 #include <linux/slab.h>
  24 #include <linux/xarray.h>
  25
  26 #include <linux/atomic.h>
  27 #include <linux/uaccess.h>
  28
  29 #define MEMORY_CLASS_NAME       "memory"
  30
/*
 * Sysfs names of the memory online types, indexed by MMOP_* value.
 * Used for parsing in mhp_online_type_from_str() and for printing in
 * auto_online_blocks_show().
 */
static const char *const online_type_to_str[] = {
        [MMOP_OFFLINE] = "offline",
        [MMOP_ONLINE] = "online",
        [MMOP_ONLINE_KERNEL] = "online_kernel",
        [MMOP_ONLINE_MOVABLE] = "online_movable",
};
  37
  38 int mhp_online_type_from_str(const char *str)
  39 {
  40         int i;
  41
  42         for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
  43                 if (sysfs_streq(str, online_type_to_str[i]))
  44                         return i;
  45         }
  46         return -EINVAL;
  47 }
  48
/* Map an embedded struct device back to its containing struct memory_block. */
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

/* Number of memory sections covered by a single memory block device. */
static int sections_per_block;
  52
  53 static inline unsigned long memory_block_id(unsigned long section_nr)
  54 {
  55         return section_nr / sections_per_block;
  56 }
  57
  58 static inline unsigned long pfn_to_block_id(unsigned long pfn)
  59 {
  60         return memory_block_id(pfn_to_section_nr(pfn));
  61 }
  62
  63 static inline unsigned long phys_to_block_id(unsigned long phys)
  64 {
  65         return pfn_to_block_id(PFN_DOWN(phys));
  66 }
  67
/* Bus online/offline callbacks; defined further below. */
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

/*
 * The "memory" bus type that every memory block device is attached to
 * (see __add_memory_block()). Its online/offline callbacks route to
 * memory_subsys_online() and memory_subsys_offline().
 */
static const struct bus_type memory_subsys = {
        .name = MEMORY_CLASS_NAME,
        .dev_name = MEMORY_CLASS_NAME,
        .online = memory_subsys_online,
        .offline = memory_subsys_offline,
};
  77
/*
 * Memory blocks are cached in a local XArray, indexed by memory block
 * id, to avoid a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC       XA_MARK_1

/*
 * Notifier chain invoked by memory_notify(). Clients are added and
 * removed via register_memory_notifier()/unregister_memory_notifier().
 */
static BLOCKING_NOTIFIER_HEAD(memory_chain);
  92
  93 int register_memory_notifier(struct notifier_block *nb)
  94 {
  95         return blocking_notifier_chain_register(&memory_chain, nb);
  96 }
  97 EXPORT_SYMBOL(register_memory_notifier);
  98
/*
 * Remove @nb from the memory notifier chain. The result of
 * blocking_notifier_chain_unregister() is not reported to the caller.
 */
void unregister_memory_notifier(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
 104
 105 static void memory_block_release(struct device *dev)
 106 {
 107         struct memory_block *mem = to_memory_block(dev);
 108         /* Verify that the altmap is freed */
 109         WARN_ON(mem->altmap);
 110         kfree(mem);
 111 }
 112
/*
 * Default memory block size: MIN_MEMORY_BLOCK_SIZE. The definition is
 * __weak, so a non-weak definition elsewhere takes precedence.
 */
unsigned long __weak memory_block_size_bytes(void)
{
        return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
 118
 119 /* Show the memory block ID, relative to the memory block size */
 120 static ssize_t phys_index_show(struct device *dev,
 121                                struct device_attribute *attr, char *buf)
 122 {
 123         struct memory_block *mem = to_memory_block(dev);
 124
 125         return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
 126 }
 127
 128 /*
 129  * Legacy interface that we cannot remove. Always indicate "removable"
 130  * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 131  */
 132 static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 133                               char *buf)
 134 {
 135         return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
 136 }
 137
 138 /*
 139  * online, offline, going offline, etc.
 140  */
 141 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 142                           char *buf)
 143 {
 144         struct memory_block *mem = to_memory_block(dev);
 145         const char *output;
 146
 147         /*
 148          * We can probably put these states in a nice little array
 149          * so that they're not open-coded
 150          */
 151         switch (mem->state) {
 152         case MEM_ONLINE:
 153                 output = "online";
 154                 break;
 155         case MEM_OFFLINE:
 156                 output = "offline";
 157                 break;
 158         case MEM_GOING_OFFLINE:
 159                 output = "going-offline";
 160                 break;
 161         default:
 162                 WARN_ON(1);
 163                 return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
 164         }
 165
 166         return sysfs_emit(buf, "%s\n", output);
 167 }
 168
 169 int memory_notify(unsigned long val, void *v)
 170 {
 171         return blocking_notifier_call_chain(&memory_chain, val, v);
 172 }
 173
/*
 * memblk_nr_poison() is only implemented when both CONFIG_MEMORY_FAILURE
 * and CONFIG_MEMORY_HOTPLUG are enabled (forward declaration here, the
 * definition is not in this part of the file). Otherwise the stub below
 * always reports 0. memory_block_online() refuses to online a block for
 * which this returns non-zero.
 */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
        return 0;
}
#endif
 182
/*
 * Online all pages of a memory block.
 *
 * Must acquire mem_hotplug_lock in write mode.
 *
 * The work is bracketed by mem_hotplug_begin()/mem_hotplug_done(). If the
 * block has an altmap, the first mem->altmap->free pages of the block are
 * treated as vmemmap pages: they are initialized separately and excluded
 * from the range handed to online_pages().
 *
 * Returns 0 on success, -EHWPOISON if memblk_nr_poison() reports a non-zero
 * value for the block, or the error of the MEM_PREPARE_ONLINE notifier,
 * mhp_init_memmap_on_memory() or online_pages().
 */
static int memory_block_online(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = 0;
        struct memory_notify arg;
        struct zone *zone;
        int ret;

        if (memblk_nr_poison(mem))
                return -EHWPOISON;

        zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
                                  start_pfn, nr_pages);

        /*
         * Although vmemmap pages have a different lifecycle than the pages
         * they describe (they remain until the memory is unplugged), doing
         * their initialization and accounting at memory onlining/offlining
         * stage helps to keep accounting easier to follow - e.g vmemmaps
         * belong to the same zone as the memory they backed.
         */
        if (mem->altmap)
                nr_vmemmap_pages = mem->altmap->free;

        /*
         * Describe the block to the notifiers: the altmap (vmemmap) range
         * comes first, the range that actually gets onlined follows it.
         */
        arg.altmap_start_pfn = start_pfn;
        arg.altmap_nr_pages = nr_vmemmap_pages;
        arg.start_pfn = start_pfn + nr_vmemmap_pages;
        arg.nr_pages = nr_pages - nr_vmemmap_pages;
        mem_hotplug_begin();
        ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
        ret = notifier_to_errno(ret);
        if (ret)
                goto out_notifier;

        if (nr_vmemmap_pages) {
                ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
                                                zone, mem->altmap->inaccessible);
                if (ret)
                        goto out;
        }

        ret = online_pages(start_pfn + nr_vmemmap_pages,
                           nr_pages - nr_vmemmap_pages, zone, mem->group);
        if (ret) {
                /* Undo the vmemmap initialization done above. */
                if (nr_vmemmap_pages)
                        mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
                goto out;
        }

        /*
         * Account once onlining succeeded. If the zone was unpopulated, it is
         * now already properly populated.
         */
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          nr_vmemmap_pages);

        mem->zone = zone;
        mem_hotplug_done();
        return ret;
out:
        /* Failure after MEM_PREPARE_ONLINE succeeded: tell the notifiers. */
        memory_notify(MEM_FINISH_OFFLINE, &arg);
out_notifier:
        mem_hotplug_done();
        return ret;
}
 253
/*
 * Offline all pages of a memory block.
 *
 * Must acquire mem_hotplug_lock in write mode.
 *
 * The work is bracketed by mem_hotplug_begin()/mem_hotplug_done(). If the
 * block has an altmap, the first mem->altmap->free pages of the block are
 * treated as vmemmap pages and excluded from the range handed to
 * offline_pages().
 *
 * Returns 0 on success, -EINVAL if mem->zone is not set, or the error
 * reported by offline_pages().
 */
static int memory_block_offline(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = 0;
        struct memory_notify arg;
        int ret;

        /* Without a recorded zone the block cannot be offlined. */
        if (!mem->zone)
                return -EINVAL;

        /*
         * Unaccount before offlining, such that unpopulated zone and kthreads
         * can properly be torn down in offline_pages().
         */
        if (mem->altmap)
                nr_vmemmap_pages = mem->altmap->free;

        mem_hotplug_begin();
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          -nr_vmemmap_pages);

        ret = offline_pages(start_pfn + nr_vmemmap_pages,
                            nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
        if (ret) {
                /* offline_pages() failed. Account back. */
                if (nr_vmemmap_pages)
                        adjust_present_page_count(pfn_to_page(start_pfn),
                                                  mem->group, nr_vmemmap_pages);
                goto out;
        }

        if (nr_vmemmap_pages)
                mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

        mem->zone = NULL;
        /*
         * Describe the block to the notifiers: the altmap (vmemmap) range
         * comes first, the range that was offlined follows it.
         */
        arg.altmap_start_pfn = start_pfn;
        arg.altmap_nr_pages = nr_vmemmap_pages;
        arg.start_pfn = start_pfn + nr_vmemmap_pages;
        arg.nr_pages = nr_pages - nr_vmemmap_pages;
        memory_notify(MEM_FINISH_OFFLINE, &arg);
out:
        mem_hotplug_done();
        return ret;
}
 303
 304 /*
 305  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 306  * OK to have direct references to sparsemem variables in here.
 307  */
 308 static int
 309 memory_block_action(struct memory_block *mem, unsigned long action)
 310 {
 311         int ret;
 312
 313         switch (action) {
 314         case MEM_ONLINE:
 315                 ret = memory_block_online(mem);
 316                 break;
 317         case MEM_OFFLINE:
 318                 ret = memory_block_offline(mem);
 319                 break;
 320         default:
 321                 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
 322                      "%ld\n", __func__, mem->start_section_nr, action, action);
 323                 ret = -EINVAL;
 324         }
 325
 326         return ret;
 327 }
 328
 329 static int memory_block_change_state(struct memory_block *mem,
 330                 unsigned long to_state, unsigned long from_state_req)
 331 {
 332         int ret = 0;
 333
 334         if (mem->state != from_state_req)
 335                 return -EINVAL;
 336
 337         if (to_state == MEM_OFFLINE)
 338                 mem->state = MEM_GOING_OFFLINE;
 339
 340         ret = memory_block_action(mem, to_state);
 341         mem->state = ret ? from_state_req : to_state;
 342
 343         return ret;
 344 }
 345
 346 /* The device lock serializes operations on memory_subsys_[online|offline] */
 347 static int memory_subsys_online(struct device *dev)
 348 {
 349         struct memory_block *mem = to_memory_block(dev);
 350         int ret;
 351
 352         if (mem->state == MEM_ONLINE)
 353                 return 0;
 354
 355         /*
 356          * When called via device_online() without configuring the online_type,
 357          * we want to default to MMOP_ONLINE.
 358          */
 359         if (mem->online_type == MMOP_OFFLINE)
 360                 mem->online_type = MMOP_ONLINE;
 361
 362         ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 363         mem->online_type = MMOP_OFFLINE;
 364
 365         return ret;
 366 }
 367
 368 static int memory_subsys_offline(struct device *dev)
 369 {
 370         struct memory_block *mem = to_memory_block(dev);
 371
 372         if (mem->state == MEM_OFFLINE)
 373                 return 0;
 374
 375         return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
 376 }
 377
 378 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 379                            const char *buf, size_t count)
 380 {
 381         const int online_type = mhp_online_type_from_str(buf);
 382         struct memory_block *mem = to_memory_block(dev);
 383         int ret;
 384
 385         if (online_type < 0)
 386                 return -EINVAL;
 387
 388         ret = lock_device_hotplug_sysfs();
 389         if (ret)
 390                 return ret;
 391
 392         switch (online_type) {
 393         case MMOP_ONLINE_KERNEL:
 394         case MMOP_ONLINE_MOVABLE:
 395         case MMOP_ONLINE:
 396                 /* mem->online_type is protected by device_hotplug_lock */
 397                 mem->online_type = online_type;
 398                 ret = device_online(&mem->dev);
 399                 break;
 400         case MMOP_OFFLINE:
 401                 ret = device_offline(&mem->dev);
 402                 break;
 403         default:
 404                 ret = -EINVAL; /* should never happen */
 405         }
 406
 407         unlock_device_hotplug();
 408
 409         if (ret < 0)
 410                 return ret;
 411         if (ret)
 412                 return -EINVAL;
 413
 414         return count;
 415 }
 416
 417 /*
 418  * Legacy interface that we cannot remove: s390x exposes the storage increment
 419  * covered by a memory block, allowing for identifying which memory blocks
 420  * comprise a storage increment. Since a memory block spans complete
 421  * storage increments nowadays, this interface is basically unused. Other
 422  * archs never exposed != 0.
 423  */
 424 static ssize_t phys_device_show(struct device *dev,
 425                                 struct device_attribute *attr, char *buf)
 426 {
 427         struct memory_block *mem = to_memory_block(dev);
 428         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 429
 430         return sysfs_emit(buf, "%d\n",
 431                           arch_get_memory_phys_device(start_pfn));
 432 }
 433
 434 #ifdef CONFIG_MEMORY_HOTREMOVE
 435 static int print_allowed_zone(char *buf, int len, int nid,
 436                               struct memory_group *group,
 437                               unsigned long start_pfn, unsigned long nr_pages,
 438                               int online_type, struct zone *default_zone)
 439 {
 440         struct zone *zone;
 441
 442         zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
 443         if (zone == default_zone)
 444                 return 0;
 445
 446         return sysfs_emit_at(buf, len, " %s", zone->name);
 447 }
 448
 449 static ssize_t valid_zones_show(struct device *dev,
 450                                 struct device_attribute *attr, char *buf)
 451 {
 452         struct memory_block *mem = to_memory_block(dev);
 453         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 454         unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 455         struct memory_group *group = mem->group;
 456         struct zone *default_zone;
 457         int nid = mem->nid;
 458         int len = 0;
 459
 460         /*
 461          * Check the existing zone. Make sure that we do that only on the
 462          * online nodes otherwise the page_zone is not reliable
 463          */
 464         if (mem->state == MEM_ONLINE) {
 465                 /*
 466                  * If !mem->zone, the memory block spans multiple zones and
 467                  * cannot get offlined.
 468                  */
 469                 default_zone = mem->zone;
 470                 if (!default_zone)
 471                         return sysfs_emit(buf, "%s\n", "none");
 472                 len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 473                 goto out;
 474         }
 475
 476         default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
 477                                           start_pfn, nr_pages);
 478
 479         len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 480         len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 481                                   MMOP_ONLINE_KERNEL, default_zone);
 482         len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 483                                   MMOP_ONLINE_MOVABLE, default_zone);
 484 out:
 485         len += sysfs_emit_at(buf, len, "\n");
 486         return len;
 487 }
 488 static DEVICE_ATTR_RO(valid_zones);
 489 #endif
 490
/*
 * Per-memory-block sysfs attributes; they are collected in
 * memory_memblk_attrs[] further below.
 */
static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);
 495
 496 /*
 497  * Show the memory block size (shared by all memory blocks).
 498  */
 499 static ssize_t block_size_bytes_show(struct device *dev,
 500                                      struct device_attribute *attr, char *buf)
 501 {
 502         return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
 503 }
 504
 505 static DEVICE_ATTR_RO(block_size_bytes);
 506
 507 /*
 508  * Memory auto online policy.
 509  */
 510
 511 static ssize_t auto_online_blocks_show(struct device *dev,
 512                                        struct device_attribute *attr, char *buf)
 513 {
 514         return sysfs_emit(buf, "%s\n",
 515                           online_type_to_str[mhp_default_online_type]);
 516 }
 517
 518 static ssize_t auto_online_blocks_store(struct device *dev,
 519                                         struct device_attribute *attr,
 520                                         const char *buf, size_t count)
 521 {
 522         const int online_type = mhp_online_type_from_str(buf);
 523
 524         if (online_type < 0)
 525                 return -EINVAL;
 526
 527         mhp_default_online_type = online_type;
 528         return count;
 529 }
 530
 531 static DEVICE_ATTR_RW(auto_online_blocks);
 532
 533 #ifdef CONFIG_CRASH_HOTPLUG
 534 #include <linux/kexec.h>
/* Show the value reported by crash_hotplug_memory_support(). */
static ssize_t crash_hotplug_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
 541 #endif
 542
 543 /*
 544  * Some architectures will have custom drivers to do this, and
 545  * will not need to do it from userspace.  The fake hot-add code
 546  * as well as ppc64 will do all of their discovery in userspace
 547  * and will require this interface.
 548  */
 549 #ifdef CONFIG_ARCH_MEMORY_PROBE
 550 static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 551                            const char *buf, size_t count)
 552 {
 553         u64 phys_addr;
 554         int nid, ret;
 555         unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
 556
 557         ret = kstrtoull(buf, 0, &phys_addr);
 558         if (ret)
 559                 return ret;
 560
 561         if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
 562                 return -EINVAL;
 563
 564         ret = lock_device_hotplug_sysfs();
 565         if (ret)
 566                 return ret;
 567
 568         nid = memory_add_physaddr_to_nid(phys_addr);
 569         ret = __add_memory(nid, phys_addr,
 570                            MIN_MEMORY_BLOCK_SIZE * sections_per_block,
 571                            MHP_NONE);
 572
 573         if (ret)
 574                 goto out;
 575
 576         ret = count;
 577 out:
 578         unlock_device_hotplug();
 579         return ret;
 580 }
 581
 582 static DEVICE_ATTR_WO(probe);
 583 #endif
 584
 585 #ifdef CONFIG_MEMORY_FAILURE
 586 /*
 587  * Support for offlining pages of memory
 588  */
 589
 590 /* Soft offline a page */
 591 static ssize_t soft_offline_page_store(struct device *dev,
 592                                        struct device_attribute *attr,
 593                                        const char *buf, size_t count)
 594 {
 595         int ret;
 596         u64 pfn;
 597         if (!capable(CAP_SYS_ADMIN))
 598                 return -EPERM;
 599         if (kstrtoull(buf, 0, &pfn) < 0)
 600                 return -EINVAL;
 601         pfn >>= PAGE_SHIFT;
 602         ret = soft_offline_page(pfn, 0);
 603         return ret == 0 ? count : ret;
 604 }
 605
 606 /* Forcibly offline a page, including killing processes. */
 607 static ssize_t hard_offline_page_store(struct device *dev,
 608                                        struct device_attribute *attr,
 609                                        const char *buf, size_t count)
 610 {
 611         int ret;
 612         u64 pfn;
 613         if (!capable(CAP_SYS_ADMIN))
 614                 return -EPERM;
 615         if (kstrtoull(buf, 0, &pfn) < 0)
 616                 return -EINVAL;
 617         pfn >>= PAGE_SHIFT;
 618         ret = memory_failure(pfn, MF_SW_SIMULATED);
 619         if (ret == -EOPNOTSUPP)
 620                 ret = 0;
 621         return ret ? ret : count;
 622 }
 623
 624 static DEVICE_ATTR_WO(soft_offline_page);
 625 static DEVICE_ATTR_WO(hard_offline_page);
 626 #endif
 627
/*
 * See phys_device_show(). This default always reports 0; the definition
 * is __weak, so a non-weak definition elsewhere takes precedence.
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
        return 0;
}
 633
 634 /*
 635  * A reference for the returned memory block device is acquired.
 636  *
 637  * Called under device_hotplug_lock.
 638  */
 639 static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 640 {
 641         struct memory_block *mem;
 642
 643         mem = xa_load(&memory_blocks, block_id);
 644         if (mem)
 645                 get_device(&mem->dev);
 646         return mem;
 647 }
 648
 649 /*
 650  * Called under device_hotplug_lock.
 651  */
 652 struct memory_block *find_memory_block(unsigned long section_nr)
 653 {
 654         unsigned long block_id = memory_block_id(section_nr);
 655
 656         return find_memory_block_by_id(block_id);
 657 }
 658
/*
 * Attributes of every memory block device; NULL-terminated.
 * "valid_zones" is only present with CONFIG_MEMORY_HOTREMOVE.
 */
static struct attribute *memory_memblk_attrs[] = {
        &dev_attr_phys_index.attr,
        &dev_attr_state.attr,
        &dev_attr_phys_device.attr,
        &dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
        &dev_attr_valid_zones.attr,
#endif
        NULL
};

/* Unnamed attribute group wrapping memory_memblk_attrs[]. */
static const struct attribute_group memory_memblk_attr_group = {
        .attrs = memory_memblk_attrs,
};

/* Group list installed as dev.groups in __add_memory_block(). */
static const struct attribute_group *memory_memblk_attr_groups[] = {
        &memory_memblk_attr_group,
        NULL,
};
 678
 679 static int __add_memory_block(struct memory_block *memory)
 680 {
 681         int ret;
 682
 683         memory->dev.bus = &memory_subsys;
 684         memory->dev.id = memory->start_section_nr / sections_per_block;
 685         memory->dev.release = memory_block_release;
 686         memory->dev.groups = memory_memblk_attr_groups;
 687         memory->dev.offline = memory->state == MEM_OFFLINE;
 688
 689         ret = device_register(&memory->dev);
 690         if (ret) {
 691                 put_device(&memory->dev);
 692                 return ret;
 693         }
 694         ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
 695                               GFP_KERNEL));
 696         if (ret)
 697                 device_unregister(&memory->dev);
 698
 699         return ret;
 700 }
 701
 702 static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
 703                                                      int nid)
 704 {
 705         const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 706         const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 707         struct zone *zone, *matching_zone = NULL;
 708         pg_data_t *pgdat = NODE_DATA(nid);
 709         int i;
 710
 711         /*
 712          * This logic only works for early memory, when the applicable zones
 713          * already span the memory block. We don't expect overlapping zones on
 714          * a single node for early memory. So if we're told that some PFNs
 715          * of a node fall into this memory block, we can assume that all node
 716          * zones that intersect with the memory block are actually applicable.
 717          * No need to look at the memmap.
 718          */
 719         for (i = 0; i < MAX_NR_ZONES; i++) {
 720                 zone = pgdat->node_zones + i;
 721                 if (!populated_zone(zone))
 722                         continue;
 723                 if (!zone_intersects(zone, start_pfn, nr_pages))
 724                         continue;
 725                 if (!matching_zone) {
 726                         matching_zone = zone;
 727                         continue;
 728                 }
 729                 /* Spans multiple zones ... */
 730                 matching_zone = NULL;
 731                 break;
 732         }
 733         return matching_zone;
 734 }
 735
 736 #ifdef CONFIG_NUMA
 737 /**
 738  * memory_block_add_nid() - Indicate that system RAM falling into this memory
 739  *                          block device (partially) belongs to the given node.
 740  * @mem: The memory block device.
 741  * @nid: The node id.
 742  * @context: The memory initialization context.
 743  *
 744  * Indicate that system RAM falling into this memory block (partially) belongs
 745  * to the given node. If the context indicates ("early") that we are adding the
 746  * node during node device subsystem initialization, this will also properly
 747  * set/adjust mem->zone based on the zone ranges of the given node.
 748  */
 749 void memory_block_add_nid(struct memory_block *mem, int nid,
 750                           enum meminit_context context)
 751 {
 752         if (context == MEMINIT_EARLY && mem->nid != nid) {
 753                 /*
 754                  * For early memory we have to determine the zone when setting
 755                  * the node id and handle multiple nodes spanning a single
 756                  * memory block by indicate via zone == NULL that we're not
 757                  * dealing with a single zone. So if we're setting the node id
 758                  * the first time, determine if there is a single zone. If we're
 759                  * setting the node id a second time to a different node,
 760                  * invalidate the single detected zone.
 761                  */
 762                 if (mem->nid == NUMA_NO_NODE)
 763                         mem->zone = early_node_zone_for_memory_block(mem, nid);
 764                 else
 765                         mem->zone = NULL;
 766         }
 767
 768         /*
 769          * If this memory block spans multiple nodes, we only indicate
 770          * the last processed node. If we span multiple nodes (not applicable
 771          * to hotplugged memory), zone == NULL will prohibit memory offlining
 772          * and consequently unplug.
 773          */
 774         mem->nid = nid;
 775 }
 776 #endif
 777
 778 static int add_memory_block(unsigned long block_id, unsigned long state,
 779                             struct vmem_altmap *altmap,
 780                             struct memory_group *group)
 781 {
 782         struct memory_block *mem;
 783         int ret = 0;
 784
 785         mem = find_memory_block_by_id(block_id);
 786         if (mem) {
 787                 put_device(&mem->dev);
 788                 return -EEXIST;
 789         }
 790         mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 791         if (!mem)
 792                 return -ENOMEM;
 793
 794         mem->start_section_nr = block_id * sections_per_block;
 795         mem->state = state;
 796         mem->nid = NUMA_NO_NODE;
 797         mem->altmap = altmap;
 798         INIT_LIST_HEAD(&mem->group_next);
 799
 800 #ifndef CONFIG_NUMA
 801         if (state == MEM_ONLINE)
 802                 /*
 803                  * MEM_ONLINE at this point implies early memory. With NUMA,
 804                  * we'll determine the zone when setting the node id via
 805                  * memory_block_add_nid(). Memory hotplug updated the zone
 806                  * manually when memory onlining/offlining succeeds.
 807                  */
 808                 mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
 809 #endif /* CONFIG_NUMA */
 810
 811         ret = __add_memory_block(mem);
 812         if (ret)
 813                 return ret;
 814
 815         if (group) {
 816                 mem->group = group;
 817                 list_add(&mem->group_next, &group->memory_blocks);
 818         }
 819
 820         return 0;
 821 }
 822
 823 static int __init add_boot_memory_block(unsigned long base_section_nr)
 824 {
 825         int section_count = 0;
 826         unsigned long nr;
 827
 828         for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
 829              nr++)
 830                 if (present_section_nr(nr))
 831                         section_count++;
 832
 833         if (section_count == 0)
 834                 return 0;
 835         return add_memory_block(memory_block_id(base_section_nr),
 836                                 MEM_ONLINE, NULL,  NULL);
 837 }
 838
/*
 * Create the memory block device with id @block_id in state MEM_OFFLINE;
 * @altmap and @group are passed through to add_memory_block(), whose
 * result is returned.
 */
static int add_hotplug_memory_block(unsigned long block_id,
                                    struct vmem_altmap *altmap,
                                    struct memory_group *group)
{
        return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}
 845
 846 static void remove_memory_block(struct memory_block *memory)
 847 {
 848         if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
 849                 return;
 850
 851         WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
 852
 853         if (memory->group) {
 854                 list_del(&memory->group_next);
 855                 memory->group = NULL;
 856         }
 857
 858         /* drop the ref. we got via find_memory_block() */
 859         put_device(&memory->dev);
 860         device_unregister(&memory->dev);
 861 }
 862
 863 /*
 864  * Create memory block devices for the given memory area. Start and size
 865  * have to be aligned to memory block granularity. Memory block devices
 866  * will be initialized as offline.
 867  *
 868  * Called under device_hotplug_lock.
 869  */
 870 int create_memory_block_devices(unsigned long start, unsigned long size,
 871                                 struct vmem_altmap *altmap,
 872                                 struct memory_group *group)
 873 {
 874         const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 875         unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 876         struct memory_block *mem;
 877         unsigned long block_id;
 878         int ret = 0;
 879
 880         if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 881                          !IS_ALIGNED(size, memory_block_size_bytes())))
 882                 return -EINVAL;
 883
 884         for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 885                 ret = add_hotplug_memory_block(block_id, altmap, group);
 886                 if (ret)
 887                         break;
 888         }
 889         if (ret) {
 890                 end_block_id = block_id;
 891                 for (block_id = start_block_id; block_id != end_block_id;
 892                      block_id++) {
 893                         mem = find_memory_block_by_id(block_id);
 894                         if (WARN_ON_ONCE(!mem))
 895                                 continue;
 896                         remove_memory_block(mem);
 897                 }
 898         }
 899         return ret;
 900 }
 901
 /*
  * Remove the memory block devices covering the given area.
  *
  * @start and @size must both be multiples of the memory block size;
  * otherwise a one-time warning is issued and nothing is removed. The
  * memory block devices have to be offline.
  *
  * Called under device_hotplug_lock.
  */
 void remove_memory_block_devices(unsigned long start, unsigned long size)
 {
         const unsigned long first_id = pfn_to_block_id(PFN_DOWN(start));
         const unsigned long limit_id = pfn_to_block_id(PFN_DOWN(start + size));
         unsigned long id;

         if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                          !IS_ALIGNED(size, memory_block_size_bytes())))
                 return;

         for (id = first_id; id != limit_id; id++) {
                 struct memory_block *mem = find_memory_block_by_id(id);

                 /* A missing block is unexpected; warn once and move on. */
                 if (WARN_ON_ONCE(!mem))
                         continue;

                 /* Take this block's poisoned pages out of the global count. */
                 num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
                 unregister_memory_block_under_nodes(mem);
                 remove_memory_block(mem);
         }
 }
 929
 930 static struct attribute *memory_root_attrs[] = {
 931 #ifdef CONFIG_ARCH_MEMORY_PROBE
 932         &dev_attr_probe.attr,
 933 #endif
 934
 935 #ifdef CONFIG_MEMORY_FAILURE
 936         &dev_attr_soft_offline_page.attr,
 937         &dev_attr_hard_offline_page.attr,
 938 #endif
 939
 940         &dev_attr_block_size_bytes.attr,
 941         &dev_attr_auto_online_blocks.attr,
 942 #ifdef CONFIG_CRASH_HOTPLUG
 943         &dev_attr_crash_hotplug.attr,
 944 #endif
 945         NULL
 946 };
 947
 948 static const struct attribute_group memory_root_attr_group = {
 949         .attrs = memory_root_attrs,
 950 };
 951
 952 static const struct attribute_group *memory_root_attr_groups[] = {
 953         &memory_root_attr_group,
 954         NULL,
 955 };
 956
 957 /*
 958  * Initialize the sysfs support for memory devices. At the time this function
 959  * is called, we cannot have concurrent creation/deletion of memory block
 960  * devices, the device_hotplug_lock is not needed.
 961  */
 962 void __init memory_dev_init(void)
 963 {
 964         int ret;
 965         unsigned long block_sz, nr;
 966
 967         /* Validate the configured memory block size */
 968         block_sz = memory_block_size_bytes();
 969         if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
 970                 panic("Memory block size not suitable: 0x%lx\n", block_sz);
 971         sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
 972
 973         ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 974         if (ret)
 975                 panic("%s() failed to register subsystem: %d\n", __func__, ret);
 976
 977         /*
 978          * Create entries for memory sections that were found
 979          * during boot and have been initialized
 980          */
 981         for (nr = 0; nr <= __highest_present_section_nr;
 982              nr += sections_per_block) {
 983                 ret = add_boot_memory_block(nr);
 984                 if (ret)
 985                         panic("%s() failed to add memory block: %d\n", __func__,
 986                               ret);
 987         }
 988 }
 989
 990 /**
 991  * walk_memory_blocks - walk through all present memory blocks overlapped
 992  *                      by the range [start, start + size)
 993  *
 994  * @start: start address of the memory range
 995  * @size: size of the memory range
 996  * @arg: argument passed to func
 997  * @func: callback for each memory section walked
 998  *
 999  * This function walks through all present memory blocks overlapped by the
1000  * range [start, start + size), calling func on each memory block.
1001  *
1002  * In case func() returns an error, walking is aborted and the error is
1003  * returned.
1004  *
1005  * Called under device_hotplug_lock.
1006  */
1007 int walk_memory_blocks(unsigned long start, unsigned long size,
1008                        void *arg, walk_memory_blocks_func_t func)
1009 {
1010         const unsigned long start_block_id = phys_to_block_id(start);
1011         const unsigned long end_block_id = phys_to_block_id(start + size - 1);
1012         struct memory_block *mem;
1013         unsigned long block_id;
1014         int ret = 0;
1015
1016         if (!size)
1017                 return 0;
1018
1019         for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
1020                 mem = find_memory_block_by_id(block_id);
1021                 if (!mem)
1022                         continue;
1023
1024                 ret = func(mem, arg);
1025                 put_device(&mem->dev);
1026                 if (ret)
1027                         break;
1028         }
1029         return ret;
1030 }
1031
1032 struct for_each_memory_block_cb_data {
1033         walk_memory_blocks_func_t func;
1034         void *arg;
1035 };
1036
1037 static int for_each_memory_block_cb(struct device *dev, void *data)
1038 {
1039         struct memory_block *mem = to_memory_block(dev);
1040         struct for_each_memory_block_cb_data *cb_data = data;
1041
1042         return cb_data->func(mem, cb_data->arg);
1043 }
1044
1045 /**
1046  * for_each_memory_block - walk through all present memory blocks
1047  *
1048  * @arg: argument passed to func
1049  * @func: callback for each memory block walked
1050  *
1051  * This function walks through all present memory blocks, calling func on
1052  * each memory block.
1053  *
1054  * In case func() returns an error, walking is aborted and the error is
1055  * returned.
1056  */
1057 int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
1058 {
1059         struct for_each_memory_block_cb_data cb_data = {
1060                 .func = func,
1061                 .arg = arg,
1062         };
1063
1064         return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
1065                                 for_each_memory_block_cb);
1066 }
1067
1068 /*
1069  * This is an internal helper to unify allocation and initialization of
1070  * memory groups. Note that the passed memory group will be copied to a
1071  * dynamically allocated memory group. After this call, the passed
1072  * memory group should no longer be used.
1073  */
1074 static int memory_group_register(struct memory_group group)
1075 {
1076         struct memory_group *new_group;
1077         uint32_t mgid;
1078         int ret;
1079
1080         if (!node_possible(group.nid))
1081                 return -EINVAL;
1082
1083         new_group = kzalloc(sizeof(group), GFP_KERNEL);
1084         if (!new_group)
1085                 return -ENOMEM;
1086         *new_group = group;
1087         INIT_LIST_HEAD(&new_group->memory_blocks);
1088
1089         ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
1090                        GFP_KERNEL);
1091         if (ret) {
1092                 kfree(new_group);
1093                 return ret;
1094         } else if (group.is_dynamic) {
1095                 xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
1096         }
1097         return mgid;
1098 }
1099
1100 /**
1101  * memory_group_register_static() - Register a static memory group.
1102  * @nid: The node id.
1103  * @max_pages: The maximum number of pages we'll have in this static memory
1104  *             group.
1105  *
1106  * Register a new static memory group and return the memory group id.
1107  * All memory in the group belongs to a single unit, such as a DIMM. All
1108  * memory belonging to a static memory group is added in one go to be removed
1109  * in one go -- it's static.
1110  *
1111  * Returns an error if out of memory, if the node id is invalid, if no new
1112  * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
1113  * returns the new memory group id.
1114  */
1115 int memory_group_register_static(int nid, unsigned long max_pages)
1116 {
1117         struct memory_group group = {
1118                 .nid = nid,
1119                 .s = {
1120                         .max_pages = max_pages,
1121                 },
1122         };
1123
1124         if (!max_pages)
1125                 return -EINVAL;
1126         return memory_group_register(group);
1127 }
1128 EXPORT_SYMBOL_GPL(memory_group_register_static);
1129
1130 /**
1131  * memory_group_register_dynamic() - Register a dynamic memory group.
1132  * @nid: The node id.
1133  * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
1134  *              memory group.
1135  *
1136  * Register a new dynamic memory group and return the memory group id.
1137  * Memory within a dynamic memory group is added/removed dynamically
1138  * in unit_pages.
1139  *
1140  * Returns an error if out of memory, if the node id is invalid, if no new
1141  * memory groups can be registered, or if unit_pages is invalid (0, not a
1142  * power of two, smaller than a single memory block). Otherwise, returns the
1143  * new memory group id.
1144  */
1145 int memory_group_register_dynamic(int nid, unsigned long unit_pages)
1146 {
1147         struct memory_group group = {
1148                 .nid = nid,
1149                 .is_dynamic = true,
1150                 .d = {
1151                         .unit_pages = unit_pages,
1152                 },
1153         };
1154
1155         if (!unit_pages || !is_power_of_2(unit_pages) ||
1156             unit_pages < PHYS_PFN(memory_block_size_bytes()))
1157                 return -EINVAL;
1158         return memory_group_register(group);
1159 }
1160 EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
1161
1162 /**
1163  * memory_group_unregister() - Unregister a memory group.
1164  * @mgid: the memory group id
1165  *
1166  * Unregister a memory group. If any memory block still belongs to this
1167  * memory group, unregistering will fail.
1168  *
1169  * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
1170  * memory blocks still belong to this memory group and returns 0 if
1171  * unregistering succeeded.
1172  */
1173 int memory_group_unregister(int mgid)
1174 {
1175         struct memory_group *group;
1176
1177         if (mgid < 0)
1178                 return -EINVAL;
1179
1180         group = xa_load(&memory_groups, mgid);
1181         if (!group)
1182                 return -EINVAL;
1183         if (!list_empty(&group->memory_blocks))
1184                 return -EBUSY;
1185         xa_erase(&memory_groups, mgid);
1186         kfree(group);
1187         return 0;
1188 }
1189 EXPORT_SYMBOL_GPL(memory_group_unregister);
1190
1191 /*
1192  * This is an internal helper only to be used in core memory hotplug code to
1193  * lookup a memory group. We don't care about locking, as we don't expect a
1194  * memory group to get unregistered while adding memory to it -- because
1195  * the group and the memory is managed by the same driver.
1196  */
1197 struct memory_group *memory_group_find_by_id(int mgid)
1198 {
1199         return xa_load(&memory_groups, mgid);
1200 }
1201
1202 /*
1203  * This is an internal helper only to be used in core memory hotplug code to
1204  * walk all dynamic memory groups excluding a given memory group, either
1205  * belonging to a specific node, or belonging to any node.
1206  */
1207 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
1208                                struct memory_group *excluded, void *arg)
1209 {
1210         struct memory_group *group;
1211         unsigned long index;
1212         int ret = 0;
1213
1214         xa_for_each_marked(&memory_groups, index, group,
1215                            MEMORY_GROUP_MARK_DYNAMIC) {
1216                 if (group == excluded)
1217                         continue;
1218 #ifdef CONFIG_NUMA
1219                 if (nid != NUMA_NO_NODE && group->nid != nid)
1220                         continue;
1221 #endif /* CONFIG_NUMA */
1222                 ret = func(group, arg);
1223                 if (ret)
1224                         break;
1225         }
1226         return ret;
1227 }
1228
1229 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
1230 void memblk_nr_poison_inc(unsigned long pfn)
1231 {
1232         const unsigned long block_id = pfn_to_block_id(pfn);
1233         struct memory_block *mem = find_memory_block_by_id(block_id);
1234
1235         if (mem)
1236                 atomic_long_inc(&mem->nr_hwpoison);
1237 }
1238
1239 void memblk_nr_poison_sub(unsigned long pfn, long i)
1240 {
1241         const unsigned long block_id = pfn_to_block_id(pfn);
1242         struct memory_block *mem = find_memory_block_by_id(block_id);
1243
1244         if (mem)
1245                 atomic_long_sub(i, &mem->nr_hwpoison);
1246 }
1247
1248 static unsigned long memblk_nr_poison(struct memory_block *mem)
1249 {
1250         return atomic_long_read(&mem->nr_hwpoison);
1251 }
1252 #endif