/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
                                     bool rsvd)
{
        if (rsvd)
                return &h_cg->rsvd_hugepage[idx];
        return &h_cg->hugepage[idx];
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
        return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
        return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
        return hugetlb_cgroup_from_css(h_cg->css.parent);
}

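/* Returns true if any hstate counter in this cgroup still has pages charged. */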
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
        struct hstate *h;

        for_each_hstate(h) {
                if (page_counter_read(
                    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
                        return true;
        }
        return false;
}

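/*
 * Set up the fault and reservation page counters for every hstate,
 * parenting them to the matching counters of @parent_h_cgroup (if any)
 * and capping each at the largest multiple of the huge page size that
 * fits in PAGE_COUNTER_MAX.
 */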
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
                                struct hugetlb_cgroup *parent_h_cgroup)
{
        int idx;

        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
                struct page_counter *fault_parent = NULL;
                struct page_counter *rsvd_parent = NULL;
                unsigned long limit;
                int ret;

                if (parent_h_cgroup) {
                        fault_parent = hugetlb_cgroup_counter_from_cgroup(
                                parent_h_cgroup, idx);
                        rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
                                parent_h_cgroup, idx);
                }
                page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
                                                                     idx),
                                  fault_parent);
                page_counter_init(
                        hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
                        rsvd_parent);

                limit = round_down(PAGE_COUNTER_MAX,
                                   pages_per_huge_page(&hstates[idx]));

                ret = page_counter_set_max(
                        hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
                        limit);
                VM_BUG_ON(ret);
                ret = page_counter_set_max(
                        hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
                        limit);
                VM_BUG_ON(ret);
        }
}

static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
        int node;

        for_each_node(node)
                kfree(h_cgroup->nodeinfo[node]);
        kfree(h_cgroup);
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
        struct hugetlb_cgroup *h_cgroup;
        int node;

        h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
                           GFP_KERNEL);

        if (!h_cgroup)
                return ERR_PTR(-ENOMEM);

        if (!parent_h_cgroup)
                root_h_cgroup = h_cgroup;

        /*
         * TODO: this routine can waste a lot of memory for nodes which will
         * never be onlined. It would be better to use a memory hotplug
         * callback instead.
         */
        for_each_node(node) {
                /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
                int node_to_alloc =
                        node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
                h_cgroup->nodeinfo[node] =
                        kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
                                     GFP_KERNEL, node_to_alloc);
                if (!h_cgroup->nodeinfo[node])
                        goto fail_alloc_nodeinfo;
        }

        hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
        return &h_cgroup->css;

fail_alloc_nodeinfo:
        hugetlb_cgroup_free(h_cgroup);
        return ERR_PTR(-ENOMEM);
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
        hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or test whether the page is active here. This
 * function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                                       struct page *page)
{
        unsigned int nr_pages;
        struct page_counter *counter;
        struct hugetlb_cgroup *page_hcg;
        struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
        struct folio *folio = page_folio(page);

        page_hcg = hugetlb_cgroup_from_folio(folio);
        /*
         * We can have pages on the active list that belong to no cgroup,
         * i.e., hugepages with fewer than 3 pages. We can safely ignore
         * those pages.
         */
        if (!page_hcg || page_hcg != h_cg)
                goto out;

        nr_pages = compound_nr(page);
        if (!parent) {
                parent = root_h_cgroup;
                /* root has no limit */
                page_counter_charge(&parent->hugepage[idx], nr_pages);
        }
        counter = &h_cg->hugepage[idx];
        /* Take the pages off the local counter */
        page_counter_cancel(counter, nr_pages);

        set_hugetlb_cgroup(folio, parent);
out:
        return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
        struct hstate *h;
        struct page *page;

        do {
                for_each_hstate(h) {
                        spin_lock_irq(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);

                        spin_unlock_irq(&hugetlb_lock);
                }
                cond_resched();
        } while (hugetlb_cgroup_have_usage(h_cg));
}

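/*
 * Record a hugetlb memory event (currently only HUGETLB_MAX) in this
 * cgroup's local counters, then in the hierarchical counters of every
 * ancestor up to, but not including, the root.
 */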
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
                                 enum hugetlb_memory_event event)
{
        atomic_long_inc(&hugetlb->events_local[idx][event]);
        cgroup_file_notify(&hugetlb->events_local_file[idx]);

        do {
                atomic_long_inc(&hugetlb->events[idx][event]);
                cgroup_file_notify(&hugetlb->events_file[idx]);
        } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
                 !hugetlb_cgroup_is_root(hugetlb));
}

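/*
 * Try to charge @nr_pages of hstate @idx against the current task's hugetlb
 * cgroup. On success *ptr points to the charged cgroup; on failure -ENOMEM
 * is returned and a HUGETLB_MAX event is recorded. Reservation charges
 * (@rsvd) keep their css reference until they are uncharged.
 */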
static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                          struct hugetlb_cgroup **ptr,
                                          bool rsvd)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = NULL;

        if (hugetlb_cgroup_disabled())
                goto done;
again:
        rcu_read_lock();
        h_cg = hugetlb_cgroup_from_task(current);
        if (!css_tryget(&h_cg->css)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        if (!page_counter_try_charge(
                    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                    nr_pages, &counter)) {
                ret = -ENOMEM;
                hugetlb_event(h_cg, idx, HUGETLB_MAX);
                css_put(&h_cg->css);
                goto done;
        }
        /*
         * Reservations take a reference to the css because they do not get
         * reparented.
         */
        if (!rsvd)
                css_put(&h_cg->css);
done:
        *ptr = h_cg;
        return ret;
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                      struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}

/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                           struct hugetlb_cgroup *h_cg,
                                           struct folio *folio, bool rsvd)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        __set_hugetlb_cgroup(folio, h_cg, rsvd);
        if (!rsvd) {
                unsigned long usage =
                        h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
                /*
                 * This write is not atomic due to fetching usage and writing
                 * to it, but that's fine because we call this with
                 * hugetlb_lock held anyway.
                 */
                WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
                           usage + nr_pages);
        }
}

void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct folio *folio)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}

void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
                                       struct hugetlb_cgroup *h_cg,
                                       struct folio *folio)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}

/*
 * Should be called with hugetlb_lock held
 */
static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                            struct folio *folio, bool rsvd)
{
        struct hugetlb_cgroup *h_cg;

        if (hugetlb_cgroup_disabled())
                return;
        lockdep_assert_held(&hugetlb_lock);
        h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
        if (unlikely(!h_cg))
                return;
        __set_hugetlb_cgroup(folio, NULL, rsvd);

        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
                                                                    rsvd),
                              nr_pages);

        if (rsvd)
                css_put(&h_cg->css);
        else {
                unsigned long usage =
                        h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
                /*
                 * This write is not atomic due to fetching usage and writing
                 * to it, but that's fine because we call this with
                 * hugetlb_lock held anyway.
                 */
                WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
                           usage - nr_pages);
        }
}

void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                   struct folio *folio)
{
        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}

void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
                                        struct folio *folio)
{
        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}

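/*
 * Drop a charge of @nr_pages from @h_cg directly, without an associated
 * folio.
 */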
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                             struct hugetlb_cgroup *h_cg,
                                             bool rsvd)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
                                                                    rsvd),
                              nr_pages);

        if (rsvd)
                css_put(&h_cg->css);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                         struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

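/*
 * Uncharge (end - start) huge pages' worth of reservation from the counter
 * attached to @resv and drop the css reference the reservation held.
 */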
void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
                                     unsigned long end)
{
        if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
            !resv->css)
                return;

        page_counter_uncharge(resv->reservation_counter,
                              (end - start) * resv->pages_per_hpage);
        css_put(resv->css);
}

void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                         struct file_region *rg,
                                         unsigned long nr_pages,
                                         bool region_del)
{
        if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
                return;

        if (rg->reservation_counter && resv->pages_per_hpage &&
            !resv->reservation_counter) {
                page_counter_uncharge(rg->reservation_counter,
                                      nr_pages * resv->pages_per_hpage);
                /*
                 * Only do css_put(rg->css) when we delete the entire region
                 * because one file_region must hold exactly one css reference.
                 */
                if (region_del)
                        css_put(rg->css);
        }
}

enum {
        RES_USAGE,
        RES_RSVD_USAGE,
        RES_LIMIT,
        RES_RSVD_LIMIT,
        RES_MAX_USAGE,
        RES_RSVD_MAX_USAGE,
        RES_FAILCNT,
        RES_RSVD_FAILCNT,
};

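/*
 * Emit the per-hstate numa_stat file: per-node usage, hierarchical and (on
 * cgroup v1) non-hierarchical.
 */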
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
        int nid;
        struct cftype *cft = seq_cft(seq);
        int idx = MEMFILE_IDX(cft->private);
        bool legacy = MEMFILE_ATTR(cft->private);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
        struct cgroup_subsys_state *css;
        unsigned long usage;

        if (legacy) {
                /* Add up usage across all nodes for the non-hierarchical total. */
                usage = 0;
                for_each_node_state(nid, N_MEMORY)
                        usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
                seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

                /* Simply print the per-node usage for the non-hierarchical total. */
                for_each_node_state(nid, N_MEMORY)
                        seq_printf(seq, " N%d=%lu", nid,
                                   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
                                           PAGE_SIZE);
                seq_putc(seq, '\n');
        }

        /*
         * The hierarchical total is pretty much the value recorded by the
         * counter, so use that.
         */
        seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
                   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

        /*
         * For each node, traverse the css tree to obtain the hierarchical
         * node usage.
         */
        for_each_node_state(nid, N_MEMORY) {
                usage = 0;
                rcu_read_lock();
                css_for_each_descendant_pre(css, &h_cg->css) {
                        usage += READ_ONCE(hugetlb_cgroup_from_css(css)
                                                   ->nodeinfo[nid]
                                                   ->usage[idx]);
                }
                rcu_read_unlock();
                seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
        }

        seq_putc(seq, '\n');

        return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                                   struct cftype *cft)
{
        struct page_counter *counter;
        struct page_counter *rsvd_counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

        counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_RSVD_USAGE:
                return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->max * PAGE_SIZE;
        case RES_RSVD_LIMIT:
                return (u64)rsvd_counter->max * PAGE_SIZE;
        case RES_MAX_USAGE:
                return (u64)counter->watermark * PAGE_SIZE;
        case RES_RSVD_MAX_USAGE:
                return (u64)rsvd_counter->watermark * PAGE_SIZE;
        case RES_FAILCNT:
                return counter->failcnt;
        case RES_RSVD_FAILCNT:
                return rsvd_counter->failcnt;
        default:
                BUG();
        }
}

static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
        int idx;
        u64 val;
        struct cftype *cft = seq_cft(seq);
        unsigned long limit;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);
        counter = &h_cg->hugepage[idx];

        limit = round_down(PAGE_COUNTER_MAX,
                           pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_RSVD_USAGE:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_USAGE:
                val = (u64)page_counter_read(counter);
                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        case RES_RSVD_LIMIT:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_LIMIT:
                val = (u64)counter->max;
                if (val == limit)
                        seq_puts(seq, "max\n");
                else
                        seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        default:
                BUG();
        }

        return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

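/*
 * Common handler for writing the limit files. @max is the token that means
 * "no limit" ("-1" on cgroup v1, "max" on cgroup v2). The new limit is
 * rounded down to a whole number of huge pages.
 */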
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off,
                                    const char *max)
{
        int ret, idx;
        unsigned long nr_pages;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
        bool rsvd = false;

        if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
                return -EINVAL;

        buf = strstrip(buf);
        ret = page_counter_memparse(buf, max, &nr_pages);
        if (ret)
                return ret;

        idx = MEMFILE_IDX(of_cft(of)->private);
        nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_RSVD_LIMIT:
                rsvd = true;
                fallthrough;
        case RES_LIMIT:
                mutex_lock(&hugetlb_limit_mutex);
                ret = page_counter_set_max(
                        __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                        nr_pages);
                mutex_unlock(&hugetlb_limit_mutex);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

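/*
 * Writing to the legacy max_usage_in_bytes / failcnt files resets the
 * corresponding watermark or failure count.
 */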
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        int ret = 0;
        struct page_counter *counter, *rsvd_counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_MAX_USAGE:
                page_counter_reset_watermark(counter);
                break;
        case RES_RSVD_MAX_USAGE:
                page_counter_reset_watermark(rsvd_counter);
                break;
        case RES_FAILCNT:
                counter->failcnt = 0;
                break;
        case RES_RSVD_FAILCNT:
                rsvd_counter->failcnt = 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

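/* Format a huge page size in bytes as a human-readable string, e.g. "2MB". */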
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize >= SZ_1G)
                snprintf(buf, size, "%luGB", hsize / SZ_1G);
        else if (hsize >= SZ_1M)
                snprintf(buf, size, "%luMB", hsize / SZ_1M);
        else
                snprintf(buf, size, "%luKB", hsize / SZ_1K);
        return buf;
}

static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
        int idx;
        long max;
        struct cftype *cft = seq_cft(seq);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);

        if (local)
                max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
        else
                max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

        seq_printf(seq, "max %lu\n", max);

        return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, true);
}

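/* Register the per-hstate control files for the cgroup v2 (default) hierarchy. */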
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, sizeof(buf), huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_dfl[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->write = hugetlb_cgroup_write_dfl;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the reservation limit file */
        cft = &h->cgroup_files_dfl[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->write = hugetlb_cgroup_write_dfl;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the current usage file */
        cft = &h->cgroup_files_dfl[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the current reservation usage file */
        cft = &h->cgroup_files_dfl[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events file */
        cft = &h->cgroup_files_dfl[4];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events.local file */
        cft = &h->cgroup_files_dfl[5];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_local_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup,
                                    events_local_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the numa stat file */
        cft = &h->cgroup_files_dfl[6];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_cgroup_read_numa_stat;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_dfl[7];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
                                       h->cgroup_files_dfl));
}

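/* Register the per-hstate control files for the cgroup v1 (legacy) hierarchy. */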
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, sizeof(buf), huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_legacy[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write_legacy;

        /* Add the reservation limit file */
        cft = &h->cgroup_files_legacy[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write_legacy;

        /* Add the usage file */
        cft = &h->cgroup_files_legacy[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the reservation usage file */
        cft = &h->cgroup_files_legacy[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX usage file */
        cft = &h->cgroup_files_legacy[4];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX reservation usage file */
        cft = &h->cgroup_files_legacy[5];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the failcnt file */
        cft = &h->cgroup_files_legacy[6];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the reservation failcnt file */
        cft = &h->cgroup_files_legacy[7];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the numa stat file */
        cft = &h->cgroup_files_legacy[8];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
        cft->private = MEMFILE_PRIVATE(idx, 1);
        cft->seq_show = hugetlb_cgroup_read_numa_stat;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_legacy[9];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
                                          h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
        __hugetlb_cgroup_file_dfl_init(idx);
        __hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
        struct hstate *h;

        for_each_hstate(h)
                __hugetlb_cgroup_file_init(hstate_index(h));
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
        struct hugetlb_cgroup *h_cg;
        struct hugetlb_cgroup *h_cg_rsvd;
        struct hstate *h = folio_hstate(old_folio);

        if (hugetlb_cgroup_disabled())
                return;

        spin_lock_irq(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_folio(old_folio);
        h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
        set_hugetlb_cgroup(old_folio, NULL);
        set_hugetlb_cgroup_rsvd(old_folio, NULL);

        /* move the h_cg details to new cgroup */
        set_hugetlb_cgroup(new_folio, h_cg);
        set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
        list_move(&new_folio->lru, &h->hugepage_activelist);
        spin_unlock_irq(&hugetlb_lock);
        return;
}

static struct cftype hugetlb_files[] = {
        {} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc      = hugetlb_cgroup_css_alloc,
        .css_offline    = hugetlb_cgroup_css_offline,
        .css_free       = hugetlb_cgroup_css_free,
        .dfl_cftypes    = hugetlb_files,
        .legacy_cftypes = hugetlb_files,
};