1 // SPDX-License-Identifier: GPL-2.0
3 * Basic Node interface support
6 #include <linux/module.h>
7 #include <linux/init.h>
9 #include <linux/memory.h>
10 #include <linux/vmstat.h>
11 #include <linux/notifier.h>
12 #include <linux/node.h>
13 #include <linux/hugetlb.h>
14 #include <linux/compaction.h>
15 #include <linux/cpumask.h>
16 #include <linux/topology.h>
17 #include <linux/nodemask.h>
18 #include <linux/cpu.h>
19 #include <linux/device.h>
20 #include <linux/pm_runtime.h>
21 #include <linux/swap.h>
22 #include <linux/slab.h>
/*
 * Subsystem (bus type) that node devices belong to: register_node() assigns
 * it to each node's dev.bus and register_node_type() registers it with
 * subsys_system_register(). The initializer fields are not visible in this
 * listing.
 */
24 static struct bus_type node_subsys = {
/*
 * node_read_cpumap - print the online CPUs of a node into a sysfs buffer.
 * @dev:  node device; to_node(dev)->dev.id is used as the node id
 * @list: forwarded to cpumap_print_to_pagebuf() to select the output format
 * @buf:  output buffer (PAGE_SIZE, per the comment in the body)
 *
 * A temporary cpumask is allocated, filled with the intersection of the
 * node's CPUs and cpu_online_mask, printed into @buf, and freed again.
 * NOTE(review): the local declarations, the allocation-failure branch and the
 * return statement are not visible in this listing.
 */
30 static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
34 struct node *node_dev = to_node(dev);
36 /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
37 BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
39 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
/* Report only those CPUs of this node that are currently online. */
42 cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
43 n = cpumap_print_to_pagebuf(list, buf, mask);
44 free_cpumask_var(mask);
/*
 * Show callback of the "cpumap" attribute: node_read_cpumap() with
 * list == false. @attr is not used.
 */
49 static inline ssize_t node_read_cpumask(struct device *dev,
50 struct device_attribute *attr, char *buf)
52 return node_read_cpumap(dev, false, buf);
/*
 * Show callback of the "cpulist" attribute: node_read_cpumap() with
 * list == true. @attr is not used.
 */
54 static inline ssize_t node_read_cpulist(struct device *dev,
55 struct device_attribute *attr, char *buf)
57 return node_read_cpumap(dev, true, buf);
/* Read-only (S_IRUGO) per-node attributes; no store callback (NULL). */
60 static DEVICE_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL);
61 static DEVICE_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
64 * struct node_access_nodes - Access class device to hold user visible
65 * relationships to other nodes.
66 * @dev: Device for this memory access class
67 * @list_node: List element in the node's access list
68 * @access: The access class rank
/*
 * Only the list linkage is visible in this listing. The members "dev" and
 * "access" are used elsewhere in this file (to_access_nodes(),
 * node_init_node_access()), but their declarations are not shown here.
 */
70 struct node_access_nodes {
/* Links this entry into the owning node's access_list. */
72 struct list_head list_node;
/* Map an embedded struct device back to its containing node_access_nodes. */
75 #define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)
/*
 * Attribute arrays, the two attribute groups built from them, and the group
 * list that node_init_node_access() installs as dev->groups on every
 * access-class device. The array entries, the groups' other fields and the
 * list contents are not visible in this listing.
 */
77 static struct attribute *node_init_access_node_attrs[] = {
81 static struct attribute *node_targ_access_node_attrs[] = {
85 static const struct attribute_group initiators = {
87 .attrs = node_init_access_node_attrs,
90 static const struct attribute_group targets = {
92 .attrs = node_targ_access_node_attrs,
95 static const struct attribute_group *node_access_node_groups[] = {
/*
 * node_remove_accesses - tear down all access-class devices of @node.
 *
 * Each entry is unlinked from node->access_list and its device is
 * unregistered. The _safe iterator is used because entries are removed
 * while the list is being walked.
 */
101 static void node_remove_accesses(struct node *node)
103 struct node_access_nodes *c, *cnext;
105 list_for_each_entry_safe(c, cnext, &node->access_list, list_node) {
106 list_del(&c->list_node);
107 device_unregister(&c->dev);
/*
 * Device release callback (installed as dev->release by
 * node_init_node_access()): frees the node_access_nodes that embeds @dev.
 */
111 static void node_access_release(struct device *dev)
113 kfree(to_access_nodes(dev));
/*
 * node_init_node_access - find or create the access-class device of a node.
 * @node: node that owns the access list
 * (second parameter "access": access class rank; its declaration is on a line
 * not visible in this listing)
 *
 * The node's access_list is searched for an entry with a matching rank.
 * Otherwise a new zeroed entry is allocated, set up as a child device of the
 * node named "access<rank>", registered, and appended to access_list.
 *
 * NOTE(review): the return statements, the error labels and the branch
 * targets are not visible here.
 * NOTE(review): the visible error handling calls kfree_const() on the kobject
 * name after the device_register() check. If that path is reached after
 * device_register() failed, the driver core expects put_device() rather than
 * freeing the memory directly — confirm against the full source.
 */
116 static struct node_access_nodes *node_init_node_access(struct node *node,
119 struct node_access_nodes *access_node;
/* Reuse an existing device for this access class if there is one. */
122 list_for_each_entry(access_node, &node->access_list, list_node)
123 if (access_node->access == access)
126 access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
130 access_node->access = access;
131 dev = &access_node->dev;
132 dev->parent = &node->dev;
133 dev->release = node_access_release;
134 dev->groups = node_access_node_groups;
135 if (dev_set_name(dev, "access%u", access))
138 if (device_register(dev))
141 pm_runtime_no_callbacks(dev);
142 list_add_tail(&access_node->list_node, &node->access_list);
145 kfree_const(dev->kobj.name);
/* Scale @x by 2^(PAGE_SHIFT - 10); used below to print page counts in kB. */
151 #define K(x) ((x) << (PAGE_SHIFT - 10))
/*
 * node_read_meminfo - show callback of the per-node "meminfo" attribute.
 * @dev:  node device
 * @attr: not used in the visible lines
 * @buf:  output buffer
 *
 * Writes "Node <nid> <Field>: <value> kB" lines into @buf with sprintf(),
 * keeping the running length in n. Values come from si_meminfo_node() and
 * from the per-node and per-zone page-state counters, scaled with K(). The
 * hugetlb report is appended last.
 * NOTE(review): the declarations of n, nid and i, several argument lines, the
 * closing preprocessor lines and the return statement are not visible in
 * this listing.
 */
152 static ssize_t node_read_meminfo(struct device *dev,
153 struct device_attribute *attr, char *buf)
157 struct pglist_data *pgdat = NODE_DATA(nid);
159 unsigned long sreclaimable, sunreclaimable;
161 si_meminfo_node(&i, nid);
/* Read the slab counters once; they are reused for several fields below. */
162 sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
163 sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
165 "Node %d MemTotal: %8lu kB\n"
166 "Node %d MemFree: %8lu kB\n"
167 "Node %d MemUsed: %8lu kB\n"
168 "Node %d Active: %8lu kB\n"
169 "Node %d Inactive: %8lu kB\n"
170 "Node %d Active(anon): %8lu kB\n"
171 "Node %d Inactive(anon): %8lu kB\n"
172 "Node %d Active(file): %8lu kB\n"
173 "Node %d Inactive(file): %8lu kB\n"
174 "Node %d Unevictable: %8lu kB\n"
175 "Node %d Mlocked: %8lu kB\n",
178 nid, K(i.totalram - i.freeram),
179 nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
180 node_page_state(pgdat, NR_ACTIVE_FILE)),
181 nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
182 node_page_state(pgdat, NR_INACTIVE_FILE)),
183 nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
184 nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
185 nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
186 nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
187 nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
188 nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
190 #ifdef CONFIG_HIGHMEM
191 n += sprintf(buf + n,
192 "Node %d HighTotal: %8lu kB\n"
193 "Node %d HighFree: %8lu kB\n"
194 "Node %d LowTotal: %8lu kB\n"
195 "Node %d LowFree: %8lu kB\n",
198 nid, K(i.totalram - i.totalhigh),
199 nid, K(i.freeram - i.freehigh));
201 n += sprintf(buf + n,
202 "Node %d Dirty: %8lu kB\n"
203 "Node %d Writeback: %8lu kB\n"
204 "Node %d FilePages: %8lu kB\n"
205 "Node %d Mapped: %8lu kB\n"
206 "Node %d AnonPages: %8lu kB\n"
207 "Node %d Shmem: %8lu kB\n"
208 "Node %d KernelStack: %8lu kB\n"
209 "Node %d PageTables: %8lu kB\n"
210 "Node %d NFS_Unstable: %8lu kB\n"
211 "Node %d Bounce: %8lu kB\n"
212 "Node %d WritebackTmp: %8lu kB\n"
213 "Node %d KReclaimable: %8lu kB\n"
214 "Node %d Slab: %8lu kB\n"
215 "Node %d SReclaimable: %8lu kB\n"
216 "Node %d SUnreclaim: %8lu kB\n"
217 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
218 "Node %d AnonHugePages: %8lu kB\n"
219 "Node %d ShmemHugePages: %8lu kB\n"
220 "Node %d ShmemPmdMapped: %8lu kB\n"
223 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
224 nid, K(node_page_state(pgdat, NR_WRITEBACK)),
225 nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
226 nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
227 nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
/* Passed without K(); presumably this counter is already in kB — TODO confirm. */
229 nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
230 nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
231 nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
232 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
233 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
234 nid, K(sreclaimable +
235 node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
236 nid, K(sreclaimable + sunreclaimable),
237 nid, K(sreclaimable),
238 nid, K(sunreclaimable)
239 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* The multiplier applied to the three THP counters is on lines not visible here. */
241 nid, K(node_page_state(pgdat, NR_ANON_THPS) *
243 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
245 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
/* Let hugetlb append its own per-node lines after ours. */
249 n += hugetlb_report_node_meminfo(nid, buf + n);
/* Read-only "meminfo" attribute backed by the function above. */
254 static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
/*
 * node_read_numastat - show callback of the per-node "numastat" attribute.
 *
 * Prints six NUMA counters summed over the zones of node dev->id, in this
 * argument order: NUMA_HIT, NUMA_MISS, NUMA_FOREIGN, NUMA_INTERLEAVE_HIT,
 * NUMA_LOCAL, NUMA_OTHER. Only one line of the format string is visible in
 * this listing; @attr is not used in the visible lines.
 */
256 static ssize_t node_read_numastat(struct device *dev,
257 struct device_attribute *attr, char *buf)
263 "interleave_hit %lu\n"
266 sum_zone_numa_state(dev->id, NUMA_HIT),
267 sum_zone_numa_state(dev->id, NUMA_MISS),
268 sum_zone_numa_state(dev->id, NUMA_FOREIGN),
269 sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
270 sum_zone_numa_state(dev->id, NUMA_LOCAL),
271 sum_zone_numa_state(dev->id, NUMA_OTHER));
/* Read-only "numastat" attribute backed by the function above. */
273 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
/*
 * node_read_vmstat - show callback of the per-node "vmstat" attribute.
 *
 * Emits one "<name> <value>" line per counter, in three passes: zone
 * counters, NUMA counters, then node counters. Names are taken from
 * vmstat_text[], which the indexing below treats as laid out in that same
 * order (zone items first, then NUMA items, then node items).
 * NOTE(review): the declarations of nid, i and n, any preprocessor guards
 * around the loops and the return statement are not visible in this listing.
 */
275 static ssize_t node_read_vmstat(struct device *dev,
276 struct device_attribute *attr, char *buf)
279 struct pglist_data *pgdat = NODE_DATA(nid);
/* Pass 1: per-zone counters summed over this node. */
283 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
284 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
285 sum_zone_node_page_state(nid, i));
/* Pass 2: NUMA counters; their names follow the zone names in vmstat_text[]. */
288 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
289 n += sprintf(buf+n, "%s %lu\n",
290 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
291 sum_zone_numa_state(nid, i));
/* Pass 3: per-node counters; their names follow the NUMA names. */
294 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
295 n += sprintf(buf+n, "%s %lu\n",
296 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
297 NR_VM_NUMA_STAT_ITEMS],
298 node_page_state(pgdat, i));
/* Read-only "vmstat" attribute backed by the function above. */
302 static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
/*
 * node_read_distance - show callback of the per-node "distance" attribute.
 *
 * Prints node_distance(nid, i) for every online node i on a single line,
 * separated by spaces and terminated by a newline.
 * NOTE(review): the declarations of nid, len and i and the return statement
 * are not visible in this listing.
 */
304 static ssize_t node_read_distance(struct device *dev,
305 struct device_attribute *attr, char *buf)
312 * buf is currently PAGE_SIZE in length and each node needs 4 chars
313 * at the most (distance + space or newline).
315 BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);
/* The separator is skipped when i == 0, so the line has no leading space. */
317 for_each_online_node(i)
318 len += sprintf(buf + len, "%s%d", i ? " " : "", node_distance(nid, i));
320 len += sprintf(buf + len, "\n");
/* Read-only "distance" attribute backed by the function above. */
323 static DEVICE_ATTR(distance, S_IRUGO, node_read_distance, NULL);
/*
 * Default attributes of every node device. ATTRIBUTE_GROUPS(node_dev)
 * derives node_dev_groups from this array, which register_node() installs
 * as dev.groups. The array terminator is not visible in this listing.
 */
325 static struct attribute *node_dev_attrs[] = {
326 &dev_attr_cpumap.attr,
327 &dev_attr_cpulist.attr,
328 &dev_attr_meminfo.attr,
329 &dev_attr_numastat.attr,
330 &dev_attr_distance.attr,
331 &dev_attr_vmstat.attr,
334 ATTRIBUTE_GROUPS(node_dev);
336 #ifdef CONFIG_HUGETLBFS
338 * hugetlbfs per node attributes registration interface:
339 * When/if hugetlb[fs] subsystem initializes [sometime after this module],
340 * it will register its per node attributes for all online nodes with
341 * memory. It will also call register_hugetlbfs_with_node(), below, to
342 * register its attribute registration functions with this node driver.
343 * Once these hooks have been initialized, the node driver will call into
344 * the hugetlb module to [un]register attributes for hot-plugged nodes.
/*
 * Hooks supplied through register_hugetlbfs_with_node(). Both callers below
 * test them for NULL before calling.
 */
346 static node_registration_func_t __hugetlb_register_node;
347 static node_registration_func_t __hugetlb_unregister_node;
/*
 * hugetlb_register_node - invoke the hugetlb registration hook for @node.
 *
 * The hook is called only if it has been set and the node is in the
 * N_MEMORY state. node_hugetlb_work() treats a false result as "node has
 * no memory"; the return statements themselves are not visible in this
 * listing.
 */
349 static inline bool hugetlb_register_node(struct node *node)
351 if (__hugetlb_register_node &&
352 node_state(node->dev.id, N_MEMORY)) {
353 __hugetlb_register_node(node);
/* Invoke the hugetlb unregistration hook for @node, if one has been set. */
359 static inline void hugetlb_unregister_node(struct node *node)
361 if (__hugetlb_unregister_node)
362 __hugetlb_unregister_node(node);
/*
 * register_hugetlbfs_with_node - install the hugetlb per-node hooks.
 * @doregister: stored as the registration hook
 * @unregister: stored as the unregistration hook
 *
 * The pointers are stored with plain assignments; no locking is visible.
 */
365 void register_hugetlbfs_with_node(node_registration_func_t doregister,
366 node_registration_func_t unregister)
368 __hugetlb_register_node = doregister;
369 __hugetlb_unregister_node = unregister;
/*
 * No-op variants (presumably the !CONFIG_HUGETLBFS branch; the #else line is
 * not visible in this listing).
 * NOTE(review): this hugetlb_register_node() returns void while the variant
 * above returns bool. The only visible caller that tests the result,
 * node_hugetlb_work(), sits under #ifdef CONFIG_HUGETLBFS.
 */
372 static inline void hugetlb_register_node(struct node *node) {}
374 static inline void hugetlb_unregister_node(struct node *node) {}
/*
 * node_device_release - release callback of node devices (installed by
 * register_node()).
 *
 * When both CONFIG_MEMORY_HOTPLUG_SPARSE and CONFIG_HUGETLBFS are set, any
 * pending node_work is flushed first, for the reason given in the comment
 * below. The remainder of the body is not visible in this listing.
 */
377 static void node_device_release(struct device *dev)
379 struct node *node = to_node(dev);
381 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
383 * We schedule the work only when a memory section is
384 * onlined/offlined on this node. When we come here,
385 * all the memory on this node has been offlined,
386 * so we won't enqueue new work to this work.
388 * The work is using node->node_work, so we should
389 * flush work before freeing the memory.
391 flush_work(&node->node_work);
397 * register_node - Setup a sysfs device for a node.
398 * @num: Node number to use when creating the device.
400 * Initialize and register the node device.
/*
 * Sets the bus, release callback and default attribute groups of the node
 * device, registers it, and then registers the node with hugetlb and
 * compaction.
 * NOTE(review): the conditions guarding put_device() and the two
 * *_register_node() calls, the use of @num and the return statement are on
 * lines not visible in this listing.
 */
402 static int register_node(struct node *node, int num)
407 node->dev.bus = &node_subsys;
408 node->dev.release = node_device_release;
409 node->dev.groups = node_dev_groups;
410 error = device_register(&node->dev);
413 put_device(&node->dev);
415 hugetlb_register_node(node);
417 compaction_register_node(node);
423 * unregister_node - unregister a node device
424 * @node: node going away
426 * Unregisters a node device @node. All the devices on the node must be
427 * unregistered before calling this function.
/*
 * Teardown order: the hugetlb attributes first, then the access-class child
 * devices, and finally the node device itself.
 */
429 void unregister_node(struct node *node)
431 hugetlb_unregister_node(node); /* no-op, if memoryless node */
432 node_remove_accesses(node);
433 device_unregister(&node->dev);
/*
 * Node device objects indexed by node id. Entries are allocated in
 * __register_one_node() and reset to NULL in unregister_one_node().
 */
436 struct node *node_devices[MAX_NUMNODES];
439 * register cpu under node
/*
 * register_cpu_under_node - cross-link a CPU device and a node device in sysfs.
 * @cpu: CPU number, resolved with get_cpu_device()
 * @nid: node id; checked with node_online() first
 *
 * Creates a link in the node's directory named after the CPU kobject, then a
 * link in the CPU's directory named after the node kobject.
 * NOTE(review): the early-exit return values, the NULL check on the CPU
 * device and the check of the first link's result are not visible in this
 * listing.
 */
441 int register_cpu_under_node(unsigned int cpu, unsigned int nid)
446 if (!node_online(nid))
449 obj = get_cpu_device(cpu);
453 ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
455 kobject_name(&obj->kobj));
459 return sysfs_create_link(&obj->kobj,
460 &node_devices[nid]->dev.kobj,
461 kobject_name(&node_devices[nid]->dev.kobj));
465 * register_memory_node_under_compute_node - link memory node to its compute
466 * node for a given access class.
467 * @mem_nid: Memory node number
468 * @cpu_nid: CPU node number
469 * @access: Access class to register
472 * For use with platforms that may have separate memory and compute nodes.
473 * This function will export node relationships linking which memory
474 * initiator nodes can access memory targets at a given ranked access
/*
 * Implementation notes: both nodes must be online. An access-class device is
 * looked up or created on each node with node_init_node_access(). The target
 * node is then linked into the initiator's "targets" group, and the initiator
 * node into the target's "initiators" group. The final visible statement
 * removes the "targets" link again (presumably the rollback for a failed
 * second link; the labels and branches are not visible in this listing).
 * NOTE(review): the third parameter (access class), the return values and
 * the error branches are on lines not visible here.
 */
477 int register_memory_node_under_compute_node(unsigned int mem_nid,
478 unsigned int cpu_nid,
481 struct node *init_node, *targ_node;
482 struct node_access_nodes *initiator, *target;
485 if (!node_online(cpu_nid) || !node_online(mem_nid))
488 init_node = node_devices[cpu_nid];
489 targ_node = node_devices[mem_nid];
490 initiator = node_init_node_access(init_node, access);
491 target = node_init_node_access(targ_node, access);
492 if (!initiator || !target)
495 ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
496 &targ_node->dev.kobj,
497 dev_name(&targ_node->dev));
501 ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
502 &init_node->dev.kobj,
503 dev_name(&init_node->dev));
509 sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
510 dev_name(&targ_node->dev));
/*
 * unregister_cpu_under_node - remove the sysfs links between a CPU and a node.
 * @cpu: CPU number, resolved with get_cpu_device()
 * @nid: node id; checked with node_online() first
 *
 * Removes the link named after the CPU from the node's directory and the
 * link named after the node from the CPU's directory, mirroring
 * register_cpu_under_node().
 * NOTE(review): the early exits and return values are not visible in this
 * listing.
 */
514 int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
518 if (!node_online(nid))
521 obj = get_cpu_device(cpu);
525 sysfs_remove_link(&node_devices[nid]->dev.kobj,
526 kobject_name(&obj->kobj));
527 sysfs_remove_link(&obj->kobj,
528 kobject_name(&node_devices[nid]->dev.kobj));
533 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/*
 * get_nid_for_pfn - map a page frame number to its node id.
 *
 * With CONFIG_DEFERRED_STRUCT_PAGE_INIT, early_pfn_to_nid() is used while
 * system_state < SYSTEM_RUNNING; otherwise pfn_to_nid() is used.
 * NOTE(review): the value returned when pfn_valid_within() fails is on a
 * line not visible in this listing.
 */
534 static int __ref get_nid_for_pfn(unsigned long pfn)
536 if (!pfn_valid_within(pfn))
538 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
539 if (system_state < SYSTEM_RUNNING)
540 return early_pfn_to_nid(pfn);
542 return pfn_to_nid(pfn);
545 /* register memory section under specified node if it spans that node */
/*
 * @mem_blk: memory block whose sections are scanned
 * @arg:     pointer to an int holding the node id (dereferenced below)
 *
 * Scans every pfn from the start of the block's first section to the end of
 * its last section. On a qualifying pfn it creates a link to the block in
 * the node's directory and a link to the node in the block's directory, and
 * returns.
 * NOTE(review): the page_nid comparison, the "continue" statements and the
 * return values are on lines not visible in this listing.
 */
546 int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
548 int ret, nid = *(int *)arg;
549 unsigned long pfn, sect_start_pfn, sect_end_pfn;
553 sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
554 sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
/* Extend the end to the last pfn of the final section (inclusive bound). */
555 sect_end_pfn += PAGES_PER_SECTION - 1;
556 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
560 * memory block could have several absent sections from start.
561 * skip pfn range from absent section
563 if (!pfn_present(pfn)) {
/* Jump to the last pfn of this section; the loop's pfn++ then enters the next one. */
564 pfn = round_down(pfn + PAGES_PER_SECTION,
565 PAGES_PER_SECTION) - 1;
570 * We need to check if page belongs to nid only for the boot
571 * case, during hotplug we know that all pages in the memory
572 * block belong to the same node.
574 if (system_state == SYSTEM_BOOTING) {
575 page_nid = get_nid_for_pfn(pfn);
/* The _nowarn link variants are used for both directions. */
581 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
583 kobject_name(&mem_blk->dev.kobj));
587 return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
588 &node_devices[nid]->dev.kobj,
589 kobject_name(&node_devices[nid]->dev.kobj));
591 /* mem section does not span the specified node */
595 /* unregister memory section under all nodes that it spans */
/*
 * @mem_blk:    memory block whose links are removed
 * @phys_index: section number; exactly one section's pfn range is scanned
 *
 * For each pfn of the section the owning node is looked up. A scratch
 * nodemask (unlinked_nodes) records the nodes already handled, so the pair
 * of links is removed once per node.
 * NOTE(review): the argument checks in front of the first NODEMASK_FREE(),
 * the "continue" statements and the return values are on lines not visible
 * in this listing.
 */
596 int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
597 unsigned long phys_index)
599 NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
600 unsigned long pfn, sect_start_pfn, sect_end_pfn;
603 NODEMASK_FREE(unlinked_nodes);
608 nodes_clear(*unlinked_nodes);
610 sect_start_pfn = section_nr_to_pfn(phys_index);
611 sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
612 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
615 nid = get_nid_for_pfn(pfn);
618 if (!node_online(nid))
/* Marks nid as handled and tests whether it already was. */
620 if (node_test_and_set(nid, *unlinked_nodes))
622 sysfs_remove_link(&node_devices[nid]->dev.kobj,
623 kobject_name(&mem_blk->dev.kobj));
624 sysfs_remove_link(&mem_blk->dev.kobj,
625 kobject_name(&node_devices[nid]->dev.kobj));
627 NODEMASK_FREE(unlinked_nodes);
/*
 * link_mem_sections - run register_mem_sect_under_node() over a pfn range.
 *
 * walk_memory_range() is given the address of the local nid as the callback
 * argument, which the callback reads back as an int. The walker's result is
 * returned unchanged.
 */
631 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
633 return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
634 register_mem_sect_under_node);
637 #ifdef CONFIG_HUGETLBFS
639 * Handle per node hstate attribute [un]registration on transitions
640 * to/from memoryless state.
/*
 * Work handler behind node->node_work (set up in init_node_hugetlb_work()).
 * It tries to register the hugetlb attributes and, if that reports false,
 * unregisters them instead.
 */
642 static void node_hugetlb_work(struct work_struct *work)
644 struct node *node = container_of(work, struct node, node_work);
647 * We only get here when a node transitions to/from memoryless state.
648 * We can detect which transition occurred by examining whether the
649 * node has memory now. hugetlb_register_node() already check this
650 * so we try to register the attributes. If that fails, then the
651 * node has transitioned to memoryless, try to unregister the
654 if (!hugetlb_register_node(node))
655 hugetlb_unregister_node(node);
/* Bind node_hugetlb_work() to the work item of node @nid. */
658 static void init_node_hugetlb_work(int nid)
660 INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
/*
 * node_memory_callback - memory hotplug notifier (registered in
 * register_node_type()).
 * @self:   not used in the visible lines
 * @action: hotplug event; the switch statement itself is not visible here
 * @arg:    struct memory_notify describing the event
 *
 * When the event carries a node id other than NUMA_NO_NODE in
 * status_change_nid, the node's hugetlb work item is scheduled.
 * NOTE(review): the case labels that lead to schedule_work(), the handling
 * of the four cases listed at the end, and the return value are on lines not
 * visible in this listing.
 */
663 static int node_memory_callback(struct notifier_block *self,
664 unsigned long action, void *arg)
666 struct memory_notify *mnb = arg;
667 int nid = mnb->status_change_nid;
673 * offload per node hstate [un]registration to a work thread
674 * when transitioning to/from memoryless state.
676 if (nid != NUMA_NO_NODE)
677 schedule_work(&node_devices[nid]->node_work);
680 case MEM_GOING_ONLINE:
681 case MEM_GOING_OFFLINE:
682 case MEM_CANCEL_ONLINE:
683 case MEM_CANCEL_OFFLINE:
690 #endif /* CONFIG_HUGETLBFS */
691 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
693 #if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
694 !defined(CONFIG_HUGETLBFS)
/*
 * Fallback notifier callback, compiled when CONFIG_MEMORY_HOTPLUG_SPARSE or
 * CONFIG_HUGETLBFS is not set (see the #if just above). Its body is not
 * visible in this listing.
 */
695 static inline int node_memory_callback(struct notifier_block *self,
696 unsigned long action, void *arg)
/* No-op counterpart of init_node_hugetlb_work() for the same configuration. */
701 static void init_node_hugetlb_work(int nid) { }
/*
 * __register_one_node - allocate and register the device for node @nid.
 *
 * Allocates a zeroed struct node into node_devices[nid], registers it, links
 * every present CPU that belongs to this node, initialises the node's
 * access_list and sets up the hugetlb work item.
 * NOTE(review): in the visible lines the result of register_node() is stored
 * in "error" but not checked before node_devices[nid] is used again, and
 * access_list is initialised only after the device has been registered.
 * Confirm against the full source whether a failed registration (which calls
 * put_device() in register_node()) can leave node_devices[nid] pointing at
 * released memory.
 * NOTE(review): the local declarations and return statements are not visible
 * in this listing.
 */
705 int __register_one_node(int nid)
710 node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
711 if (!node_devices[nid])
714 error = register_node(node_devices[nid], nid);
716 /* link cpu under this node */
717 for_each_present_cpu(cpu) {
718 if (cpu_to_node(cpu) == nid)
719 register_cpu_under_node(cpu, nid);
722 INIT_LIST_HEAD(&node_devices[nid]->access_list);
723 /* initialize work queue for memory hot plug */
724 init_node_hugetlb_work(nid);
/*
 * unregister_one_node - unregister the device of node @nid, if any.
 *
 * The table slot is cleared afterwards, so a later call for the same node id
 * takes the NULL check at the top.
 */
729 void unregister_one_node(int nid)
731 if (!node_devices[nid])
734 unregister_node(node_devices[nid]);
735 node_devices[nid] = NULL;
739 * node states attributes
/*
 * print_nodes_state - format the node mask of @state into @buf.
 *
 * The mask node_states[state] is printed with the "%*pbl" bitmap format,
 * limited to PAGE_SIZE - 1 bytes (presumably to leave room for a trailing
 * character added on a line not visible here — TODO confirm).
 * NOTE(review): the declaration of n and the return statement are not
 * visible in this listing.
 */
742 static ssize_t print_nodes_state(enum node_states state, char *buf)
746 n = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
747 nodemask_pr_args(&node_states[state]));
/*
 * Members of struct node_attr (the struct header line is not visible in this
 * listing): a device attribute paired with the node state it reports.
 * show_node_state() recovers the wrapper from the embedded attr with
 * container_of().
 */
754 struct device_attribute attr;
755 enum node_states state;
/*
 * Show callback shared by all node-state attributes: recovers the enclosing
 * node_attr from @attr and prints the mask for its state. @dev is not used.
 */
758 static ssize_t show_node_state(struct device *dev,
759 struct device_attribute *attr, char *buf)
761 struct node_attr *na = container_of(attr, struct node_attr, attr);
762 return print_nodes_state(na->state, buf);
/*
 * Initializer for one struct node_attr: a 0444 attribute called @name that
 * is shown by show_node_state(), paired with @state.
 */
765 #define _NODE_ATTR(name, state) \
766 { __ATTR(name, 0444, show_node_state, NULL), state }
/*
 * One attribute per node state, indexed by the state value itself.
 * register_node_type() checks at build time that the array has exactly
 * NR_NODE_STATES entries.
 */
768 static struct node_attr node_state_attr[] = {
769 [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
770 [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
771 [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
772 #ifdef CONFIG_HIGHMEM
773 [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
775 [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
776 [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
/*
 * Plain attribute pointers for the entries of node_state_attr[], in the form
 * an attribute_group expects. register_node_type() checks at build time that
 * this array has NR_NODE_STATES + 1 elements (presumably the extra one is a
 * terminator on a line not visible in this listing).
 */
779 static struct attribute *node_state_attrs[] = {
780 &node_state_attr[N_POSSIBLE].attr.attr,
781 &node_state_attr[N_ONLINE].attr.attr,
782 &node_state_attr[N_NORMAL_MEMORY].attr.attr,
783 #ifdef CONFIG_HIGHMEM
784 &node_state_attr[N_HIGH_MEMORY].attr.attr,
786 &node_state_attr[N_MEMORY].attr.attr,
787 &node_state_attr[N_CPU].attr.attr,
/*
 * Group holding the node-state attributes, and the group list handed to
 * subsys_system_register() in register_node_type(). The list terminator is
 * not visible in this listing.
 */
791 static struct attribute_group memory_root_attr_group = {
792 .attrs = node_state_attrs,
795 static const struct attribute_group *cpu_root_attr_groups[] = {
796 &memory_root_attr_group,
/* Priority of the memory hotplug notifier registered in register_node_type(). */
800 #define NODE_CALLBACK_PRI 2 /* lower than SLAB */
/*
 * register_node_type - init-time setup of the node subsystem.
 *
 * Checks at build time that the node-state attribute tables match
 * NR_NODE_STATES, registers node_subsys together with the root attribute
 * groups, and registers node_memory_callback() as a memory hotplug notifier
 * with priority NODE_CALLBACK_PRI.
 * NOTE(review): the declaration of ret, the condition under which the
 * notifier is registered and the return statement are on lines not visible
 * in this listing.
 */
801 static int __init register_node_type(void)
805 BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
806 BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);
808 ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
/* Static storage: the notifier block must stay valid after this function returns. */
810 static struct notifier_block node_memory_callback_nb = {
811 .notifier_call = node_memory_callback,
812 .priority = NODE_CALLBACK_PRI,
814 register_hotmemory_notifier(&node_memory_callback_nb);
818 * Note: we're not going to unregister the node class if we fail
819 * to register the node state class attribute files.
/* Run register_node_type() at the postcore initcall level. */
823 postcore_initcall(register_node_type);