fs/btrfs/subpage.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only 64K page size is supported for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   every nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross a 64K page boundary
 *   btrfs-progs and the kernel have enforced this alignment for a while,
 *   thus only ancient filesystems could have such a problem.  For such a
 *   case, do a graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning that reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the
 *   extra granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on
 *   page locking anymore, or we would have greatly reduced concurrency or
 *   even deadlocks (holding one tree lock while trying to take another
 *   tree lock in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking
 *   only.  This means a slightly higher tree locking latency.
 */
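
/*
 * Illustration (a sketch, not from the original source): with a 64K page
 * and a 4K sectorsize, each u16 bitmap in struct btrfs_subpage tracks one
 * bit per sector:
 *
 *	sector:    0  1  2  3  4  5  6  7  8 ... 15
 *	uptodate:  1  1  1  1  0  0  0  0  0 ...  0   (only first 16K read)
 *
 * The full page flag (e.g. PageUptodate) is only set once all 16 bits are
 * set, i.e. when the bitmap reaches U16_MAX.
 */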

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
                         struct page *page, enum btrfs_subpage_type type)
{
        struct btrfs_subpage *subpage = NULL;
        int ret;

        /*
         * We have cases like a dummy extent buffer page, which is not mapped
         * and doesn't need to be locked.
         */
        if (page->mapping)
                ASSERT(PageLocked(page));
        /* Either not subpage, or the page already has private attached */
        if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
                return 0;

        ret = btrfs_alloc_subpage(fs_info, &subpage, type);
        if (ret < 0)
                return ret;
        attach_page_private(page, subpage);
        return 0;
}
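
/*
 * Example usage (an illustrative sketch, not a verbatim call site):
 * attaching subpage private data to a locked page cache page before
 * reading metadata from it:
 *
 *	lock_page(page);
 *	ret = btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_METADATA);
 *	if (ret < 0)
 *		return ret;	// -ENOMEM from btrfs_alloc_subpage()
 *
 * For sectorsize == PAGE_SIZE this is a no-op returning 0.
 */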

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
                          struct page *page)
{
        struct btrfs_subpage *subpage;

        /* Either not subpage, or already detached */
        if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
                return;

        subpage = (struct btrfs_subpage *)detach_page_private(page);
        ASSERT(subpage);
        btrfs_free_subpage(subpage);
}

int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
                        struct btrfs_subpage **ret,
                        enum btrfs_subpage_type type)
{
        if (fs_info->sectorsize == PAGE_SIZE)
                return 0;

        *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
        if (!*ret)
                return -ENOMEM;
        spin_lock_init(&(*ret)->lock);
        if (type == BTRFS_SUBPAGE_METADATA) {
                atomic_set(&(*ret)->eb_refs, 0);
        } else {
                atomic_set(&(*ret)->readers, 0);
                atomic_set(&(*ret)->writers, 0);
        }
        return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
        kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the last eb
 * freeing of the same page.
 * With the eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're
 * still allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
                            struct page *page)
{
        struct btrfs_subpage *subpage;

        if (fs_info->sectorsize == PAGE_SIZE)
                return;

        ASSERT(PagePrivate(page) && page->mapping);
        lockdep_assert_held(&page->mapping->private_lock);

        subpage = (struct btrfs_subpage *)page->private;
        atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
                            struct page *page)
{
        struct btrfs_subpage *subpage;

        if (fs_info->sectorsize == PAGE_SIZE)
                return;

        ASSERT(PagePrivate(page) && page->mapping);
        lockdep_assert_held(&page->mapping->private_lock);

        subpage = (struct btrfs_subpage *)page->private;
        ASSERT(atomic_read(&subpage->eb_refs));
        atomic_dec(&subpage->eb_refs);
}

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        /* Basic checks */
        ASSERT(PagePrivate(page) && page->private);
        ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
               IS_ALIGNED(len, fs_info->sectorsize));
        /*
         * The range check only works for mapped pages; we can still have
         * unmapped pages like dummy extent buffer pages.
         */
        if (page->mapping)
                ASSERT(page_offset(page) <= start &&
                       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const int nbits = len >> fs_info->sectorsize_bits;

        btrfs_subpage_assert(fs_info, page, start, len);

        atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const int nbits = len >> fs_info->sectorsize_bits;
        bool is_data;
        bool last;

        btrfs_subpage_assert(fs_info, page, start, len);
        is_data = is_data_inode(page->mapping->host);
        ASSERT(atomic_read(&subpage->readers) >= nbits);
        last = atomic_sub_and_test(nbits, &subpage->readers);

        /*
         * For data we need to unlock the page if the last read has finished.
         *
         * And please don't replace @last with an atomic_sub_and_test() call
         * inside the if () condition, as we want the atomic_sub_and_test()
         * to always be executed.
         */
        if (is_data && last)
                unlock_page(page);
}

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
        u64 orig_start = *start;
        u32 orig_len = *len;

        *start = max_t(u64, page_offset(page), orig_start);
        *len = min_t(u64, page_offset(page) + PAGE_SIZE,
                     orig_start + orig_len) - *start;
}
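
/*
 * Worked example (illustrative, assuming a 64K page at file offset 64K):
 * for the input range [60K, 100K) (start = 60K, len = 40K):
 *
 *	*start = max(64K, 60K)         = 64K
 *	*len   = min(128K, 100K) - 64K = 36K
 *
 * i.e. the range is clamped to the part [64K, 100K) that lies inside the
 * page.
 */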

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const int nbits = (len >> fs_info->sectorsize_bits);
        int ret;

        btrfs_subpage_assert(fs_info, page, start, len);

        ASSERT(atomic_read(&subpage->readers) == 0);
        ret = atomic_add_return(nbits, &subpage->writers);
        ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const int nbits = (len >> fs_info->sectorsize_bits);

        btrfs_subpage_assert(fs_info, page, start, len);

        ASSERT(atomic_read(&subpage->writers) >= nbits);
        return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and the writer counter updated.
 *
 * Even with 0 returned, the page still needs extra checks to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
                lock_page(page);
                return 0;
        }
        lock_page(page);
        if (!PagePrivate(page) || !page->private) {
                unlock_page(page);
                return -EAGAIN;
        }
        btrfs_subpage_clamp_range(page, &start, &len);
        btrfs_subpage_start_writer(fs_info, page, start, len);
        return 0;
}

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
                return unlock_page(page);
        btrfs_subpage_clamp_range(page, &start, &len);
        if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
                unlock_page(page);
}
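
/*
 * Typical pairing (a sketch, not a verbatim call site): a delalloc
 * writeback path would do something like:
 *
 *	ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
 *	if (ret == -EAGAIN)
 *		...page was invalidated, retry or skip it...
 *	...do the writeback work...
 *	btrfs_page_end_writer_lock(fs_info, page, start, len);
 *
 * With subpage, the page is only really unlocked once the last writer on
 * the page calls btrfs_page_end_writer_lock().
 */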

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
        const int nbits = len >> fs_info->sectorsize_bits;

        btrfs_subpage_assert(fs_info, page, start, len);

        /*
         * Here nbits can be 16, thus the shift can go beyond the u16 range.
         * We do the first left shift in unsigned long (at least u32), then
         * truncate the result to u16.
         */
        return (u16)(((1UL << nbits) - 1) << bit_start);
}
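
/*
 * Worked example (assuming 4K sectorsize and a 64K page, matching the
 * example in the comment above): for start == page_offset() + 16K and
 * len == 16K:
 *
 *	bit_start = 16K >> 12 = 4
 *	nbits     = 16K >> 12 = 4
 *	bitmap    = ((1UL << 4) - 1) << 4 = 0xf << 4 = 0x00f0
 *
 * For a full-page range nbits is 16, and 1 << 16 would overflow a u16;
 * this is why the shift is done in unsigned long first and only then
 * truncated to u16.
 */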

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->uptodate_bitmap |= tmp;
        if (subpage->uptodate_bitmap == U16_MAX)
                SetPageUptodate(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->uptodate_bitmap &= ~tmp;
        ClearPageUptodate(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->error_bitmap |= tmp;
        SetPageError(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->error_bitmap &= ~tmp;
        if (subpage->error_bitmap == 0)
                ClearPageError(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->dirty_bitmap |= tmp;
        spin_unlock_irqrestore(&subpage->lock, flags);
        set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Return true if we cleared the last bits in the dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag in the true case,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;
        bool last = false;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->dirty_bitmap &= ~tmp;
        if (subpage->dirty_bitmap == 0)
                last = true;
        spin_unlock_irqrestore(&subpage->lock, flags);
        return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        bool last;

        last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
        if (last)
                clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->writeback_bitmap |= tmp;
        set_page_writeback(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->writeback_bitmap &= ~tmp;
        if (subpage->writeback_bitmap == 0) {
                ASSERT(PageWriteback(page));
                end_page_writeback(page);
        }
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->ordered_bitmap |= tmp;
        SetPageOrdered(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
        unsigned long flags;

        spin_lock_irqsave(&subpage->lock, flags);
        subpage->ordered_bitmap &= ~tmp;
        if (subpage->ordered_bitmap == 0)
                ClearPageOrdered(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which depends on each page's status, for test all bits
 * are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)                           \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,     \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
        const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
        unsigned long flags;                                            \
        bool ret;                                                       \
                                                                        \
        spin_lock_irqsave(&subpage->lock, flags);                       \
        ret = ((subpage->name##_bitmap & tmp) == tmp);                  \
        spin_unlock_irqrestore(&subpage->lock, flags);                  \
        return ret;                                                     \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
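
/*
 * For reference (a sketch of the macro expansion, not extra code),
 * IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate) above expands to roughly:
 *
 *	bool btrfs_subpage_test_uptodate(const struct btrfs_fs_info *fs_info,
 *			struct page *page, u64 start, u32 len)
 *	{
 *		...
 *		ret = ((subpage->uptodate_bitmap & tmp) == tmp);
 *		...
 *	}
 *
 * i.e. the range only tests true if *every* sector in it has its bit set.
 */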

/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info
 * passed in.  We only test sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,  \
                               test_page_func)                          \
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,         \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {   \
                set_page_func(page);                                    \
                return;                                                 \
        }                                                               \
        btrfs_subpage_set_##name(fs_info, page, start, len);            \
}                                                                       \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,       \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {   \
                clear_page_func(page);                                  \
                return;                                                 \
        }                                                               \
        btrfs_subpage_clear_##name(fs_info, page, start, len);          \
}                                                                       \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,        \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)     \
                return test_page_func(page);                            \
        return btrfs_subpage_test_##name(fs_info, page, start, len);    \
}                                                                       \
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,   \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {   \
                set_page_func(page);                                    \
                return;                                                 \
        }                                                               \
        btrfs_subpage_clamp_range(page, &start, &len);                  \
        btrfs_subpage_set_##name(fs_info, page, start, len);            \
}                                                                       \
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {   \
                clear_page_func(page);                                  \
                return;                                                 \
        }                                                               \
        btrfs_subpage_clamp_range(page, &start, &len);                  \
        btrfs_subpage_clear_##name(fs_info, page, start, len);          \
}                                                                       \
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,  \
                struct page *page, u64 start, u32 len)                  \
{                                                                       \
        if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)     \
                return test_page_func(page);                            \
        btrfs_subpage_clamp_range(page, &start, &len);                  \
        return btrfs_subpage_test_##name(fs_info, page, start, len);    \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
                         PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
                         PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
                         PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
                         PageOrdered);
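
/*
 * Usage note (illustrative): the btrfs_page_*() helpers above are the
 * sectorsize-agnostic entry points, e.g.:
 *
 *	btrfs_page_set_dirty(fs_info, page, start, len);
 *
 * With sectorsize == PAGE_SIZE (or a NULL fs_info in selftests) this is
 * just set_page_dirty(page); with subpage it also updates dirty_bitmap
 * for the [start, start + len) range.  The clamp variants first clamp the
 * range to the page boundaries via btrfs_subpage_clamp_range().
 */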

/*
 * Make sure that not only the page dirty bit is cleared, but the subpage
 * dirty bit is cleared as well.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
                                 struct page *page)
{
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

        if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
                return;

        ASSERT(!PageDirty(page));
        if (fs_info->sectorsize == PAGE_SIZE)
                return;

        ASSERT(PagePrivate(page) && page->private);
        ASSERT(subpage->dirty_bitmap == 0);
}