mm/pagewalk.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        const struct mm_walk_ops *ops = walk->ops;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!ops->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}
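
/*
 * Example (illustrative sketch, hypothetical names, not part of the original
 * file): because walk_pmd_range() invokes ->pmd_entry() before any splitting,
 * a pmd_entry handler has to cope with transparent huge page pmds itself,
 * e.g. via pmd_trans_huge_lock().
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);

        if (ptl) {
                /* pmd maps a huge page: handle the whole range at once */
                spin_unlock(ptl);
                return 0;
        }
        /* Not a huge pmd: return 0 so ->pte_entry(), if set, sees the ptes */
        return 0;
}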

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pud = pud_offset(p4d, addr);
        do {
 again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (ops->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = ops->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask, sz);

                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, walk);

                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP doesn't have any valid struct pages behind
         * its range, so we don't walk over it as we do for normal vmas.
         * However, some callers are interested in handling hole ranges and
         * don't want to just ignore any single address range. Such users
         * certainly define their ->pte_hole() callbacks, so let's delegate
         * vma(VM_PFNMAP) to them.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;
                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}
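
/*
 * Example (illustrative sketch, hypothetical name, not part of the original
 * file): a ->test_walk() callback following the convention above: return 1
 * to skip the vma, 0 to walk it, or a negative errno to abort the walk.
 */
static int example_test_walk(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        if (walk->vma->vm_flags & (VM_PFNMAP | VM_IO))
                return 1;       /* skip special mappings */
        return 0;               /* walk this vma */
}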

static int __walk_page_range(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:         mm_struct representing the target process of page table walk
 * @start:      start address of the virtual address range
 * @end:        end address of the virtual address range
 * @ops:        operations to call during the walk
 * @private:    private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded in handling the current entry, and if you haven't reached
 *         the end address yet, continue to walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .private        = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk.vma || walk.ops->pte_hole)
                        err = __walk_page_range(start, next, &walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
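
/*
 * Example (illustrative sketch, hypothetical names, not part of the original
 * file): a minimal user of walk_page_range() that counts present ptes in a
 * range via ->pte_entry(). The caller takes mmap_sem for read, as the
 * locking rule above requires.
 */
static int count_pte_entry(pte_t *pte, unsigned long addr,
                           unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;
}

static const struct mm_walk_ops count_walk_ops = {
        .pte_entry      = count_pte_entry,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        unsigned long count = 0;

        down_read(&mm->mmap_sem);
        walk_page_range(mm, start, end, &count_walk_ops, &count);
        up_read(&mm->mmap_sem);
        return count;
}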

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = vma->vm_mm,
                .vma            = vma,
                .private        = private,
        };
        int err;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}