mm/pagewalk.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        const struct mm_walk_ops *ops = walk->ops;
        spinlock_t *ptl;

        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap_unlock(pte, ptl);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler needs to
                 * know about pmd_trans_huge() pmds (an illustrative
                 * handler is sketched after this function).
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!ops->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}
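
/*
 * Example (illustrative sketch only, not used in this file): a
 * ->pmd_entry() handler that is aware of transparent huge page pmds.
 * The callback name is hypothetical; the locking follows the pattern
 * used by existing walkers.
 *
 *	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				     unsigned long next, struct mm_walk *walk)
 *	{
 *		spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);
 *
 *		if (ptl) {
 *			// The pmd maps a huge page: handle it as a single
 *			// entry here, then skip the pte level.
 *			spin_unlock(ptl);
 *			return 0;
 *		}
 *		// Normal pmd: let the walk descend to ->pte_entry() (if
 *		// set), or inspect the page table it points to directly.
 *		return 0;
 *	}
 */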

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pud = pud_offset(p4d, addr);
        do {
 again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (ops->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = ops->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask, sz);

                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, walk);

                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * A vma with VM_PFNMAP has no valid struct pages behind its range,
         * so we don't walk over it as we do for normal vmas. However, some
         * callers are interested in handling hole ranges and don't want to
         * simply ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate them to
         * handle vma(VM_PFNMAP).
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;
                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}
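
/*
 * Example (illustrative sketch only, not used in this file): a caller that
 * only wants to visit anonymous, non-mlocked vmas could install a
 * ->test_walk() callback along these lines. The callback name is
 * hypothetical; the return convention matches the comment above
 * walk_page_test().
 *
 *	static int example_test_walk(unsigned long start, unsigned long end,
 *				     struct mm_walk *walk)
 *	{
 *		struct vm_area_struct *vma = walk->vma;
 *
 *		if (vma->vm_file || (vma->vm_flags & VM_LOCKED))
 *			return 1;	// skip this vma, keep walking
 *		return 0;		// walk this vma
 *	}
 */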

static int __walk_page_range(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (vma && ops->pre_vma) {
                err = ops->pre_vma(start, end, walk);
                if (err)
                        return err;
        }

        if (vma && is_vm_hugetlb_page(vma)) {
                if (ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        if (vma && ops->post_vma)
                ops->post_vma(walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:         mm_struct representing the target process of page table walk
 * @start:      start address of the virtual address range
 * @end:        end address of the virtual address range
 * @ops:        operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @mm->mmap_sem, because these functions traverse the vma list and/or
 *   access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .private        = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk.vma || walk.ops->pte_hole)
                        err = __walk_page_range(start, next, &walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
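
/*
 * Example (illustrative sketch only, not used in this file): counting the
 * present ptes in a range of a task's address space. The callback, ops and
 * variable names are hypothetical.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte_entry,
 *	};
 *
 *	unsigned long count = 0;
 *
 *	down_read(&mm->mmap_sem);
 *	err = walk_page_range(mm, start, end, &count_ops, &count);
 *	up_read(&mm->mmap_sem);
 */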

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = vma->vm_mm,
                .vma            = vma,
                .private        = private,
        };
        int err;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops:        operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_sem is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_sem is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .private        = private,
        };
        struct vm_area_struct *vma;
        pgoff_t vba, vea, cba, cea;
        unsigned long start_addr, end_addr;
        int err = 0;

        lockdep_assert_held(&mapping->i_mmap_rwsem);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                                  first_index + nr - 1) {
                /* Clip to the vma */
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma);
                cba = first_index;
                cba = max(cba, vba);
                cea = first_index + nr;
                cea = min(cea, vea);

                start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
                end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
                if (start_addr >= end_addr)
                        continue;

                walk.vma = vma;
                walk.mm = vma->vm_mm;

                err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
                if (err > 0) {
                        err = 0;
                        break;
                } else if (err < 0)
                        break;

                err = __walk_page_range(start_addr, end_addr, &walk);
                if (err)
                        break;
        }

        return err;
}
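
/*
 * Example (illustrative sketch only, not used in this file): visiting every
 * pte that maps a given range of a file, across all processes mapping it.
 * example_ops is hypothetical; the locking follows the requirement stated
 * above walk_page_mapping().
 *
 *	i_mmap_lock_read(mapping);
 *	err = walk_page_mapping(mapping, first_index, nr, &example_ops, NULL);
 *	i_mmap_unlock_read(mapping);
 */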