fs/btrfs/space-info.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "ctree.h"
4 #include "space-info.h"
5 #include "sysfs.h"
6 #include "volumes.h"
7 #include "free-space-cache.h"
8 #include "ordered-data.h"
9 #include "transaction.h"
10 #include "math.h"
11
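/*
 * Sum of the bytes tracked by @s_info: used, reserved, pinned and readonly,
 * plus bytes_may_use when @may_use_included is true.
 */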
12 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
13                           bool may_use_included)
14 {
15         ASSERT(s_info);
16         return s_info->bytes_used + s_info->bytes_reserved +
17                 s_info->bytes_pinned + s_info->bytes_readonly +
18                 (may_use_included ? s_info->bytes_may_use : 0);
19 }
20
21 /*
22  * after adding space to the filesystem, we need to clear the full flags
23  * on all the space infos.
24  */
25 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
26 {
27         struct list_head *head = &info->space_info;
28         struct btrfs_space_info *found;
29
30         rcu_read_lock();
31         list_for_each_entry_rcu(found, head, list)
32                 found->full = 0;
33         rcu_read_unlock();
34 }
35
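/*
 * Name of the per-space_info sysfs directory, derived from the block group
 * type flags.
 */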
36 static const char *alloc_name(u64 flags)
37 {
38         switch (flags) {
39         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
40                 return "mixed";
41         case BTRFS_BLOCK_GROUP_METADATA:
42                 return "metadata";
43         case BTRFS_BLOCK_GROUP_DATA:
44                 return "data";
45         case BTRFS_BLOCK_GROUP_SYSTEM:
46                 return "system";
47         default:
48                 WARN_ON(1);
49                 return "invalid-combination";
50         }
51 }
52
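/*
 * Allocate a new space_info for the block group type bits in @flags, set up
 * its lists, locks and sysfs kobject, and add it to fs_info->space_info.
 */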
53 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
54 {
55
56         struct btrfs_space_info *space_info;
57         int i;
58         int ret;
59
60         space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
61         if (!space_info)
62                 return -ENOMEM;
63
64         ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
65                                  GFP_KERNEL);
66         if (ret) {
67                 kfree(space_info);
68                 return ret;
69         }
70
71         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
72                 INIT_LIST_HEAD(&space_info->block_groups[i]);
73         init_rwsem(&space_info->groups_sem);
74         spin_lock_init(&space_info->lock);
75         space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
76         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
77         init_waitqueue_head(&space_info->wait);
78         INIT_LIST_HEAD(&space_info->ro_bgs);
79         INIT_LIST_HEAD(&space_info->tickets);
80         INIT_LIST_HEAD(&space_info->priority_tickets);
81
82         ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
83                                     info->space_info_kobj, "%s",
84                                     alloc_name(space_info->flags));
85         if (ret) {
86                 kobject_put(&space_info->kobj);
87                 return ret;
88         }
89
90         list_add_rcu(&space_info->list, &info->space_info);
91         if (flags & BTRFS_BLOCK_GROUP_DATA)
92                 info->data_sinfo = space_info;
93
94         return ret;
95 }
96
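/*
 * Create the initial space_infos at mount time: system, plus either a single
 * mixed metadata+data space_info or separate metadata and data ones,
 * depending on the MIXED_GROUPS incompat flag in the super block.
 */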
97 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
98 {
99         struct btrfs_super_block *disk_super;
100         u64 features;
101         u64 flags;
102         int mixed = 0;
103         int ret;
104
105         disk_super = fs_info->super_copy;
106         if (!btrfs_super_root(disk_super))
107                 return -EINVAL;
108
109         features = btrfs_super_incompat_flags(disk_super);
110         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
111                 mixed = 1;
112
113         flags = BTRFS_BLOCK_GROUP_SYSTEM;
114         ret = create_space_info(fs_info, flags);
115         if (ret)
116                 goto out;
117
118         if (mixed) {
119                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
120                 ret = create_space_info(fs_info, flags);
121         } else {
122                 flags = BTRFS_BLOCK_GROUP_METADATA;
123                 ret = create_space_info(fs_info, flags);
124                 if (ret)
125                         goto out;
126
127                 flags = BTRFS_BLOCK_GROUP_DATA;
128                 ret = create_space_info(fs_info, flags);
129         }
130 out:
131         return ret;
132 }
133
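/*
 * Account a block group's space into the matching space_info, scaling the
 * on-disk totals by the raid factor, and hand the newly usable bytes to any
 * waiting reservation tickets.
 */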
134 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
135                              u64 total_bytes, u64 bytes_used,
136                              u64 bytes_readonly,
137                              struct btrfs_space_info **space_info)
138 {
139         struct btrfs_space_info *found;
140         int factor;
141
142         factor = btrfs_bg_type_to_factor(flags);
143
144         found = btrfs_find_space_info(info, flags);
145         ASSERT(found);
146         spin_lock(&found->lock);
147         found->total_bytes += total_bytes;
148         found->disk_total += total_bytes * factor;
149         found->bytes_used += bytes_used;
150         found->disk_used += bytes_used * factor;
151         found->bytes_readonly += bytes_readonly;
152         if (total_bytes > 0)
153                 found->full = 0;
154         btrfs_space_info_add_new_bytes(info, found,
155                                        total_bytes - bytes_used -
156                                        bytes_readonly);
157         spin_unlock(&found->lock);
158         *space_info = found;
159 }
160
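/*
 * Find the space_info matching the type bits in @flags, or NULL if it does
 * not exist.
 */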
161 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
162                                                u64 flags)
163 {
164         struct list_head *head = &info->space_info;
165         struct btrfs_space_info *found;
166
167         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
168
169         rcu_read_lock();
170         list_for_each_entry_rcu(found, head, list) {
171                 if (found->flags & flags) {
172                         rcu_read_unlock();
173                         return found;
174                 }
175         }
176         rcu_read_unlock();
177         return NULL;
178 }
179
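/*
 * Headroom we want to keep for the global block reserve: twice its current
 * size, so that overcommitting never eats into space needed for critical
 * metadata operations.
 */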
180 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
181 {
182         return (global->size << 1);
183 }
184
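/*
 * Decide whether reserving @bytes may overcommit @space_info.  Data is never
 * overcommitted; metadata and system reservations may overcommit by a
 * fraction of the device space that is still unallocated for the relevant
 * raid profile, as long as the global reserve keeps its headroom.
 */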
185 static int can_overcommit(struct btrfs_fs_info *fs_info,
186                           struct btrfs_space_info *space_info, u64 bytes,
187                           enum btrfs_reserve_flush_enum flush,
188                           bool system_chunk)
189 {
190         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
191         u64 profile;
192         u64 space_size;
193         u64 avail;
194         u64 used;
195         int factor;
196
197         /* Don't overcommit when in mixed mode. */
198         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
199                 return 0;
200
201         if (system_chunk)
202                 profile = btrfs_system_alloc_profile(fs_info);
203         else
204                 profile = btrfs_metadata_alloc_profile(fs_info);
205
206         used = btrfs_space_info_used(space_info, false);
207
208         /*
209          * We only want to allow overcommitting if we have lots of actual space
210          * free, but if we don't have enough space to handle the global reserve
211          * space then we could end up having a real enospc problem when trying
212          * to allocate a chunk or some other such important allocation.
213          */
214         spin_lock(&global_rsv->lock);
215         space_size = calc_global_rsv_need_space(global_rsv);
216         spin_unlock(&global_rsv->lock);
217         if (used + space_size >= space_info->total_bytes)
218                 return 0;
219
220         used += space_info->bytes_may_use;
221
222         avail = atomic64_read(&fs_info->free_chunk_space);
223
224         /*
225          * If we have dup, raid1 or raid10 then only half of the free
226          * space is actually usable.  For raid56, the space info used
227          * doesn't include the parity drive, so we don't have to
228          * change the math
229          */
230         factor = btrfs_bg_type_to_factor(profile);
231         avail = div_u64(avail, factor);
232
233         /*
234          * If we aren't allowed to flush everything, let the reservation
235          * overcommit up to half of the available space.  If we can flush,
236          * be stricter and only allow overcommitting up to 1/8 of the space.
237          */
238         if (flush == BTRFS_RESERVE_FLUSH_ALL)
239                 avail >>= 3;
240         else
241                 avail >>= 1;
242
243         if (used + bytes < space_info->total_bytes + avail)
244                 return 1;
245         return 0;
246 }
247
248 /*
249  * This is for space we already have accounted in space_info->bytes_may_use, so
250  * basically when we're returning space from block_rsv's.
251  */
252 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
253                                     struct btrfs_space_info *space_info,
254                                     u64 num_bytes)
255 {
256         struct reserve_ticket *ticket;
257         struct list_head *head;
258         u64 used;
259         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
260         bool check_overcommit = false;
261
262         spin_lock(&space_info->lock);
263         head = &space_info->priority_tickets;
264
265         /*
266          * If we are over our limit then we need to check and see if we can
267          * overcommit, and if we can't then we just need to free up our space
268          * and not satisfy any requests.
269          */
270         used = btrfs_space_info_used(space_info, true);
271         if (used - num_bytes >= space_info->total_bytes)
272                 check_overcommit = true;
273 again:
274         while (!list_empty(head) && num_bytes) {
275                 ticket = list_first_entry(head, struct reserve_ticket,
276                                           list);
277                 /*
278                  * We use 0 bytes because this space is already reserved, so
279                  * adding the ticket space would be a double count.
280                  */
281                 if (check_overcommit &&
282                     !can_overcommit(fs_info, space_info, 0, flush, false))
283                         break;
284                 if (num_bytes >= ticket->bytes) {
285                         list_del_init(&ticket->list);
286                         num_bytes -= ticket->bytes;
287                         ticket->bytes = 0;
288                         space_info->tickets_id++;
289                         wake_up(&ticket->wait);
290                 } else {
291                         ticket->bytes -= num_bytes;
292                         num_bytes = 0;
293                 }
294         }
295
296         if (num_bytes && head == &space_info->priority_tickets) {
297                 head = &space_info->tickets;
298                 flush = BTRFS_RESERVE_FLUSH_ALL;
299                 goto again;
300         }
301         btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
302         trace_btrfs_space_reservation(fs_info, "space_info",
303                                       space_info->flags, num_bytes, 0);
304         spin_unlock(&space_info->lock);
305 }
306
307 /*
308  * This is for newly allocated space that isn't accounted in
309  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
310  * we use this helper.
311  */
312 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
313                                     struct btrfs_space_info *space_info,
314                                     u64 num_bytes)
315 {
316         struct reserve_ticket *ticket;
317         struct list_head *head = &space_info->priority_tickets;
318
319 again:
320         while (!list_empty(head) && num_bytes) {
321                 ticket = list_first_entry(head, struct reserve_ticket,
322                                           list);
323                 if (num_bytes >= ticket->bytes) {
324                         trace_btrfs_space_reservation(fs_info, "space_info",
325                                                       space_info->flags,
326                                                       ticket->bytes, 1);
327                         list_del_init(&ticket->list);
328                         num_bytes -= ticket->bytes;
329                         btrfs_space_info_update_bytes_may_use(fs_info,
330                                                               space_info,
331                                                               ticket->bytes);
332                         ticket->bytes = 0;
333                         space_info->tickets_id++;
334                         wake_up(&ticket->wait);
335                 } else {
336                         trace_btrfs_space_reservation(fs_info, "space_info",
337                                                       space_info->flags,
338                                                       num_bytes, 1);
339                         btrfs_space_info_update_bytes_may_use(fs_info,
340                                                               space_info,
341                                                               num_bytes);
342                         ticket->bytes -= num_bytes;
343                         num_bytes = 0;
344                 }
345         }
346
347         if (num_bytes && head == &space_info->priority_tickets) {
348                 head = &space_info->tickets;
349                 goto again;
350         }
351 }
352
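/* Log the size and reserved bytes of the named block reserve under its lock. */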
353 #define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
354 do {                                                                    \
355         struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
356         spin_lock(&__rsv->lock);                                        \
357         btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
358                    __rsv->size, __rsv->reserved);                       \
359         spin_unlock(&__rsv->lock);                                      \
360 } while (0)
361
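/*
 * Print a summary of @info's counters and of the global block reserves to the
 * system log, optionally walking every block group in each raid list as well.
 */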
362 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
363                            struct btrfs_space_info *info, u64 bytes,
364                            int dump_block_groups)
365 {
366         struct btrfs_block_group_cache *cache;
367         int index = 0;
368
369         spin_lock(&info->lock);
370         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
371                    info->flags,
372                    info->total_bytes - btrfs_space_info_used(info, true),
373                    info->full ? "" : "not ");
374         btrfs_info(fs_info,
375                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
376                 info->total_bytes, info->bytes_used, info->bytes_pinned,
377                 info->bytes_reserved, info->bytes_may_use,
378                 info->bytes_readonly);
379         spin_unlock(&info->lock);
380
381         DUMP_BLOCK_RSV(fs_info, global_block_rsv);
382         DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
383         DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
384         DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
385         DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
386
387         if (!dump_block_groups)
388                 return;
389
390         down_read(&info->groups_sem);
391 again:
392         list_for_each_entry(cache, &info->block_groups[index], list) {
393                 spin_lock(&cache->lock);
394                 btrfs_info(fs_info,
395                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
396                         cache->key.objectid, cache->key.offset,
397                         btrfs_block_group_used(&cache->item), cache->pinned,
398                         cache->reserved, cache->ro ? "[readonly]" : "");
399                 btrfs_dump_free_space(cache, bytes);
400                 spin_unlock(&cache->lock);
401         }
402         if (++index < BTRFS_NR_RAID_TYPES)
403                 goto again;
404         up_read(&info->groups_sem);
405 }
406
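/*
 * Start writeback of up to @nr_pages of dirty pages.  If s_umount cannot be
 * taken, fall back to flushing @nr_items worth of delalloc directly and, when
 * we are not inside a transaction, waiting for the ordered extents to finish.
 */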
407 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
408                                          unsigned long nr_pages, int nr_items)
409 {
410         struct super_block *sb = fs_info->sb;
411
412         if (down_read_trylock(&sb->s_umount)) {
413                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
414                 up_read(&sb->s_umount);
415         } else {
416                 /*
417                  * We needn't worry about the filesystem going from r/w to r/o
418                  * even though we don't acquire the ->s_umount mutex, because the
419                  * filesystem should guarantee that the delalloc inode list is
420                  * empty once the filesystem is read-only (all dirty pages have
421                  * been written to disk).
422                  */
423                 btrfs_start_delalloc_roots(fs_info, nr_items);
424                 if (!current->journal_info)
425                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
426         }
427 }
428
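/*
 * Convert @to_reclaim bytes into a number of metadata items, assuming each
 * item costs one unit of transaction metadata; always returns at least 1.
 */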
429 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
430                                         u64 to_reclaim)
431 {
432         u64 bytes;
433         u64 nr;
434
435         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
436         nr = div64_u64(to_reclaim, bytes);
437         if (!nr)
438                 nr = 1;
439         return nr;
440 }
441
442 #define EXTENT_SIZE_PER_ITEM    SZ_256K
443
444 /*
445  * shrink metadata reservation for delalloc
446  */
447 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
448                             u64 orig, bool wait_ordered)
449 {
450         struct btrfs_space_info *space_info;
451         struct btrfs_trans_handle *trans;
452         u64 delalloc_bytes;
453         u64 dio_bytes;
454         u64 async_pages;
455         u64 items;
456         long time_left;
457         unsigned long nr_pages;
458         int loops;
459
460         /* Calc the number of pages we need to flush for this space reservation */
461         items = calc_reclaim_items_nr(fs_info, to_reclaim);
462         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
463
464         trans = (struct btrfs_trans_handle *)current->journal_info;
465         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
466
467         delalloc_bytes = percpu_counter_sum_positive(
468                                                 &fs_info->delalloc_bytes);
469         dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
470         if (delalloc_bytes == 0 && dio_bytes == 0) {
471                 if (trans)
472                         return;
473                 if (wait_ordered)
474                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
475                 return;
476         }
477
478         /*
479          * If we are doing more ordered than delalloc we need to just wait on
480          * ordered extents, otherwise we'll waste time trying to flush delalloc
481          * that likely won't give us the space back we need.
482          */
483         if (dio_bytes > delalloc_bytes)
484                 wait_ordered = true;
485
486         loops = 0;
487         while ((delalloc_bytes || dio_bytes) && loops < 3) {
488                 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
489
490                 /*
491                  * Triggers inode writeback for up to nr_pages. This will invoke
492                  * ->writepages callback and trigger delalloc filling
493                  *  (btrfs_run_delalloc_range()).
494                  */
495                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
496
497                 /*
498                  * We need to wait for the compressed pages to start before
499                  * we continue.
500                  */
501                 async_pages = atomic_read(&fs_info->async_delalloc_pages);
502                 if (!async_pages)
503                         goto skip_async;
504
505                 /*
506                  * Calculate how many compressed pages we want to have written
507                  * before we continue, i.e. if there are more async pages than we
508                  * require, wait_event will wait until nr_pages of them are written.
509                  */
510                 if (async_pages <= nr_pages)
511                         async_pages = 0;
512                 else
513                         async_pages -= nr_pages;
514
515                 wait_event(fs_info->async_submit_wait,
516                            atomic_read(&fs_info->async_delalloc_pages) <=
517                            (int)async_pages);
518 skip_async:
519                 spin_lock(&space_info->lock);
520                 if (list_empty(&space_info->tickets) &&
521                     list_empty(&space_info->priority_tickets)) {
522                         spin_unlock(&space_info->lock);
523                         break;
524                 }
525                 spin_unlock(&space_info->lock);
526
527                 loops++;
528                 if (wait_ordered && !trans) {
529                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
530                 } else {
531                         time_left = schedule_timeout_killable(1);
532                         if (time_left)
533                                 break;
534                 }
535                 delalloc_bytes = percpu_counter_sum_positive(
536                                                 &fs_info->delalloc_bytes);
537                 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
538         }
539 }
540
541 /**
542  * may_commit_transaction - possibly commit the transaction if it's likely to
543  *                          free enough space for our reservation
544  * @fs_info - the filesystem we're allocating from
545  * @space_info - the space_info we're trying to satisfy a reservation for
546  *
547  * This will check to make sure that committing the transaction will actually
548  * get us somewhere and then commit the transaction if it does.  Otherwise it
549  * will return -ENOSPC.
550  */
551 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
552                                   struct btrfs_space_info *space_info)
553 {
554         struct reserve_ticket *ticket = NULL;
555         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
556         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
557         struct btrfs_trans_handle *trans;
558         u64 bytes_needed;
559         u64 reclaim_bytes = 0;
560
561         trans = (struct btrfs_trans_handle *)current->journal_info;
562         if (trans)
563                 return -EAGAIN;
564
565         spin_lock(&space_info->lock);
566         if (!list_empty(&space_info->priority_tickets))
567                 ticket = list_first_entry(&space_info->priority_tickets,
568                                           struct reserve_ticket, list);
569         else if (!list_empty(&space_info->tickets))
570                 ticket = list_first_entry(&space_info->tickets,
571                                           struct reserve_ticket, list);
572         bytes_needed = (ticket) ? ticket->bytes : 0;
573         spin_unlock(&space_info->lock);
574
575         if (!bytes_needed)
576                 return 0;
577
578         trans = btrfs_join_transaction(fs_info->extent_root);
579         if (IS_ERR(trans))
580                 return PTR_ERR(trans);
581
582         /*
583          * See if there is enough pinned space to make this reservation, or if
584          * we have block groups that are going to be freed, allowing us to
585          * possibly do a chunk allocation the next loop through.
586          */
587         if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
588             __percpu_counter_compare(&space_info->total_bytes_pinned,
589                                      bytes_needed,
590                                      BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
591                 goto commit;
592
593         /*
594          * See if there is some space in the delayed insertion reservation for
595          * this reservation.
596          */
597         if (space_info != delayed_rsv->space_info)
598                 goto enospc;
599
600         spin_lock(&delayed_rsv->lock);
601         reclaim_bytes += delayed_rsv->reserved;
602         spin_unlock(&delayed_rsv->lock);
603
604         spin_lock(&delayed_refs_rsv->lock);
605         reclaim_bytes += delayed_refs_rsv->reserved;
606         spin_unlock(&delayed_refs_rsv->lock);
607         if (reclaim_bytes >= bytes_needed)
608                 goto commit;
609         bytes_needed -= reclaim_bytes;
610
611         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
612                                    bytes_needed,
613                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
614                 goto enospc;
615
616 commit:
617         return btrfs_commit_transaction(trans);
618 enospc:
619         btrfs_end_transaction(trans);
620         return -ENOSPC;
621 }
622
623 /*
624  * Try to flush some data based on policy set by @state. This is only advisory
625  * and may fail for various reasons. The caller is supposed to examine the
626  * state of @space_info to detect the outcome.
627  */
628 static void flush_space(struct btrfs_fs_info *fs_info,
629                        struct btrfs_space_info *space_info, u64 num_bytes,
630                        int state)
631 {
632         struct btrfs_root *root = fs_info->extent_root;
633         struct btrfs_trans_handle *trans;
634         int nr;
635         int ret = 0;
636
637         switch (state) {
638         case FLUSH_DELAYED_ITEMS_NR:
639         case FLUSH_DELAYED_ITEMS:
640                 if (state == FLUSH_DELAYED_ITEMS_NR)
641                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
642                 else
643                         nr = -1;
644
645                 trans = btrfs_join_transaction(root);
646                 if (IS_ERR(trans)) {
647                         ret = PTR_ERR(trans);
648                         break;
649                 }
650                 ret = btrfs_run_delayed_items_nr(trans, nr);
651                 btrfs_end_transaction(trans);
652                 break;
653         case FLUSH_DELALLOC:
654         case FLUSH_DELALLOC_WAIT:
655                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
656                                 state == FLUSH_DELALLOC_WAIT);
657                 break;
658         case FLUSH_DELAYED_REFS_NR:
659         case FLUSH_DELAYED_REFS:
660                 trans = btrfs_join_transaction(root);
661                 if (IS_ERR(trans)) {
662                         ret = PTR_ERR(trans);
663                         break;
664                 }
665                 if (state == FLUSH_DELAYED_REFS_NR)
666                         nr = calc_reclaim_items_nr(fs_info, num_bytes);
667                 else
668                         nr = 0;
669                 btrfs_run_delayed_refs(trans, nr);
670                 btrfs_end_transaction(trans);
671                 break;
672         case ALLOC_CHUNK:
673         case ALLOC_CHUNK_FORCE:
674                 trans = btrfs_join_transaction(root);
675                 if (IS_ERR(trans)) {
676                         ret = PTR_ERR(trans);
677                         break;
678                 }
679                 ret = btrfs_chunk_alloc(trans,
680                                 btrfs_metadata_alloc_profile(fs_info),
681                                 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
682                                         CHUNK_ALLOC_FORCE);
683                 btrfs_end_transaction(trans);
684                 if (ret > 0 || ret == -ENOSPC)
685                         ret = 0;
686                 break;
687         case COMMIT_TRANS:
688                 /*
689                  * If we have pending delayed iputs then we could free up a
690                  * bunch of pinned space, so make sure we run the iputs before
691                  * we do our pinned bytes check below.
692                  */
693                 btrfs_run_delayed_iputs(fs_info);
694                 btrfs_wait_on_delayed_iputs(fs_info);
695
696                 ret = may_commit_transaction(fs_info, space_info);
697                 break;
698         default:
699                 ret = -ENOSPC;
700                 break;
701         }
702
703         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
704                                 ret);
705         return;
706 }
707
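/*
 * Estimate how much metadata space needs to be reclaimed: the sum of all
 * pending ticket bytes if there are any, otherwise, if we can no longer
 * comfortably overcommit, enough to bring usage back below ~90-95% of the
 * total space.
 */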
708 static inline u64
709 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
710                                  struct btrfs_space_info *space_info,
711                                  bool system_chunk)
712 {
713         struct reserve_ticket *ticket;
714         u64 used;
715         u64 expected;
716         u64 to_reclaim = 0;
717
718         list_for_each_entry(ticket, &space_info->tickets, list)
719                 to_reclaim += ticket->bytes;
720         list_for_each_entry(ticket, &space_info->priority_tickets, list)
721                 to_reclaim += ticket->bytes;
722         if (to_reclaim)
723                 return to_reclaim;
724
725         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
726         if (can_overcommit(fs_info, space_info, to_reclaim,
727                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
728                 return 0;
729
730         used = btrfs_space_info_used(space_info, true);
731
732         if (can_overcommit(fs_info, space_info, SZ_1M,
733                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
734                 expected = div_factor_fine(space_info->total_bytes, 95);
735         else
736                 expected = div_factor_fine(space_info->total_bytes, 90);
737
738         if (used > expected)
739                 to_reclaim = used - expected;
740         else
741                 to_reclaim = 0;
742         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
743                                      space_info->bytes_reserved);
744         return to_reclaim;
745 }
746
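/*
 * Decide whether to kick the preemptive background flusher: the space_info is
 * nearly exhausted (including may_use) but not simply full of real data,
 * there is something worth reclaiming, and the filesystem is not closing or
 * remounting.
 */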
747 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
748                                         struct btrfs_space_info *space_info,
749                                         u64 used, bool system_chunk)
750 {
751         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
752
753         /* If we're just plain full then async reclaim just slows us down. */
754         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
755                 return 0;
756
757         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
758                                               system_chunk))
759                 return 0;
760
761         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
762                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
763 }
764
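/*
 * Wake waiting tickets and fail them with -ENOSPC.  Stop and return true as
 * soon as a partially filled ticket is found, since that means the flusher
 * was still making progress; otherwise return false once the list is empty.
 */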
765 static bool wake_all_tickets(struct list_head *head)
766 {
767         struct reserve_ticket *ticket;
768
769         while (!list_empty(head)) {
770                 ticket = list_first_entry(head, struct reserve_ticket, list);
771                 list_del_init(&ticket->list);
772                 ticket->error = -ENOSPC;
773                 wake_up(&ticket->wait);
774                 if (ticket->bytes != ticket->orig_bytes)
775                         return true;
776         }
777         return false;
778 }
779
780 /*
781  * This is for normal flushers, we can wait all goddamned day if we want to.  We
782  * will loop and continuously try to flush as long as we are making progress.
783  * We count progress as clearing off tickets each time we have to loop.
784  */
785 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
786 {
787         struct btrfs_fs_info *fs_info;
788         struct btrfs_space_info *space_info;
789         u64 to_reclaim;
790         int flush_state;
791         int commit_cycles = 0;
792         u64 last_tickets_id;
793
794         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
795         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
796
797         spin_lock(&space_info->lock);
798         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
799                                                       false);
800         if (!to_reclaim) {
801                 space_info->flush = 0;
802                 spin_unlock(&space_info->lock);
803                 return;
804         }
805         last_tickets_id = space_info->tickets_id;
806         spin_unlock(&space_info->lock);
807
808         flush_state = FLUSH_DELAYED_ITEMS_NR;
809         do {
810                 flush_space(fs_info, space_info, to_reclaim, flush_state);
811                 spin_lock(&space_info->lock);
812                 if (list_empty(&space_info->tickets)) {
813                         space_info->flush = 0;
814                         spin_unlock(&space_info->lock);
815                         return;
816                 }
817                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
818                                                               space_info,
819                                                               false);
820                 if (last_tickets_id == space_info->tickets_id) {
821                         flush_state++;
822                 } else {
823                         last_tickets_id = space_info->tickets_id;
824                         flush_state = FLUSH_DELAYED_ITEMS_NR;
825                         if (commit_cycles)
826                                 commit_cycles--;
827                 }
828
829                 /*
830                  * We don't want to force a chunk allocation until we've tried
831                  * pretty hard to reclaim space.  Think of the case where we
832                  * freed up a bunch of space and so have a lot of pinned space
833                  * to reclaim.  We would rather use that than possibly create an
834                  * underutilized metadata chunk.  So if this is our first run
835                  * through the flushing state machine skip ALLOC_CHUNK_FORCE and
836                  * commit the transaction.  If nothing has changed the next go
837                  * around then we can force a chunk allocation.
838                  */
839                 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
840                         flush_state++;
841
842                 if (flush_state > COMMIT_TRANS) {
843                         commit_cycles++;
844                         if (commit_cycles > 2) {
845                                 if (wake_all_tickets(&space_info->tickets)) {
846                                         flush_state = FLUSH_DELAYED_ITEMS_NR;
847                                         commit_cycles--;
848                                 } else {
849                                         space_info->flush = 0;
850                                 }
851                         } else {
852                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
853                         }
854                 }
855                 spin_unlock(&space_info->lock);
856         } while (flush_state <= COMMIT_TRANS);
857 }
858
859 void btrfs_init_async_reclaim_work(struct work_struct *work)
860 {
861         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
862 }
863
864 static const enum btrfs_flush_state priority_flush_states[] = {
865         FLUSH_DELAYED_ITEMS_NR,
866         FLUSH_DELAYED_ITEMS,
867         ALLOC_CHUNK,
868 };
869
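/*
 * Flush for a priority ticket synchronously, walking priority_flush_states
 * until either the ticket has been satisfied or all states have been tried.
 */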
870 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
871                                             struct btrfs_space_info *space_info,
872                                             struct reserve_ticket *ticket)
873 {
874         u64 to_reclaim;
875         int flush_state;
876
877         spin_lock(&space_info->lock);
878         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
879                                                       false);
880         if (!to_reclaim) {
881                 spin_unlock(&space_info->lock);
882                 return;
883         }
884         spin_unlock(&space_info->lock);
885
886         flush_state = 0;
887         do {
888                 flush_space(fs_info, space_info, to_reclaim,
889                             priority_flush_states[flush_state]);
890                 flush_state++;
891                 spin_lock(&space_info->lock);
892                 if (ticket->bytes == 0) {
893                         spin_unlock(&space_info->lock);
894                         return;
895                 }
896                 spin_unlock(&space_info->lock);
897         } while (flush_state < ARRAY_SIZE(priority_flush_states));
898 }
899
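/*
 * Sleep until the async flusher either satisfies or fails our ticket.  If the
 * ticket was only partially filled, the partial reservation is returned to
 * the space_info before the error is reported.
 */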
900 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
901                                struct btrfs_space_info *space_info,
902                                struct reserve_ticket *ticket)
903
904 {
905         DEFINE_WAIT(wait);
906         u64 reclaim_bytes = 0;
907         int ret = 0;
908
909         spin_lock(&space_info->lock);
910         while (ticket->bytes > 0 && ticket->error == 0) {
911                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
912                 if (ret) {
913                         ret = -EINTR;
914                         break;
915                 }
916                 spin_unlock(&space_info->lock);
917
918                 schedule();
919
920                 finish_wait(&ticket->wait, &wait);
921                 spin_lock(&space_info->lock);
922         }
923         if (!ret)
924                 ret = ticket->error;
925         if (!list_empty(&ticket->list))
926                 list_del_init(&ticket->list);
927         if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
928                 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
929         spin_unlock(&space_info->lock);
930
931         if (reclaim_bytes)
932                 btrfs_space_info_add_old_bytes(fs_info, space_info,
933                                                reclaim_bytes);
934         return ret;
935 }
936
937 /**
938  * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
939  * @fs_info - the filesystem we're allocating from
940  * @space_info - the space info we want to allocate from
941  * @orig_bytes - the number of bytes we want
942  * @flush - whether or not we can flush to make our reservation
943  *
944  * This will reserve orig_bytes number of bytes from the space info associated
945  * with the block_rsv.  If there is not enough space it will make an attempt to
946  * flush out space to make room.  It will do this by flushing delalloc if
947  * possible or committing the transaction.  If flush is 0 then no attempts to
948  * regain reservations will be made and this will fail if there is not enough
949  * space already.
950  */
951 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
952                                     struct btrfs_space_info *space_info,
953                                     u64 orig_bytes,
954                                     enum btrfs_reserve_flush_enum flush,
955                                     bool system_chunk)
956 {
957         struct reserve_ticket ticket;
958         u64 used;
959         u64 reclaim_bytes = 0;
960         int ret = 0;
961
962         ASSERT(orig_bytes);
963         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
964
965         spin_lock(&space_info->lock);
966         ret = -ENOSPC;
967         used = btrfs_space_info_used(space_info, true);
968
969         /*
970          * Carry on if we have enough space (short-circuit) OR call
971          * can_overcommit() to ensure we can overcommit to continue.
972          */
973         if ((used + orig_bytes <= space_info->total_bytes) ||
974             can_overcommit(fs_info, space_info, orig_bytes, flush,
975                            system_chunk)) {
976                 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
977                                                       orig_bytes);
978                 trace_btrfs_space_reservation(fs_info, "space_info",
979                                               space_info->flags, orig_bytes, 1);
980                 ret = 0;
981         }
982
983         /*
984          * If we couldn't make a reservation then set up our reservation ticket
985          * and kick the async worker if it's not already running.
986          *
987          * If we are a priority flusher then we just need to add our ticket to
988          * the list and we will do our own flushing further down.
989          */
990         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
991                 ticket.orig_bytes = orig_bytes;
992                 ticket.bytes = orig_bytes;
993                 ticket.error = 0;
994                 init_waitqueue_head(&ticket.wait);
995                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
996                         list_add_tail(&ticket.list, &space_info->tickets);
997                         if (!space_info->flush) {
998                                 space_info->flush = 1;
999                                 trace_btrfs_trigger_flush(fs_info,
1000                                                           space_info->flags,
1001                                                           orig_bytes, flush,
1002                                                           "enospc");
1003                                 queue_work(system_unbound_wq,
1004                                            &fs_info->async_reclaim_work);
1005                         }
1006                 } else {
1007                         list_add_tail(&ticket.list,
1008                                       &space_info->priority_tickets);
1009                 }
1010         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1011                 used += orig_bytes;
1012                 /*
1013                  * We will do the space reservation dance during log replay,
1014                  * which means we won't have fs_info->fs_root set, so don't do
1015                  * the async reclaim as we will panic.
1016                  */
1017                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1018                     need_do_async_reclaim(fs_info, space_info,
1019                                           used, system_chunk) &&
1020                     !work_busy(&fs_info->async_reclaim_work)) {
1021                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
1022                                                   orig_bytes, flush, "preempt");
1023                         queue_work(system_unbound_wq,
1024                                    &fs_info->async_reclaim_work);
1025                 }
1026         }
1027         spin_unlock(&space_info->lock);
1028         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1029                 return ret;
1030
1031         if (flush == BTRFS_RESERVE_FLUSH_ALL)
1032                 return wait_reserve_ticket(fs_info, space_info, &ticket);
1033
1034         ret = 0;
1035         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
1036         spin_lock(&space_info->lock);
1037         if (ticket.bytes) {
1038                 if (ticket.bytes < orig_bytes)
1039                         reclaim_bytes = orig_bytes - ticket.bytes;
1040                 list_del_init(&ticket.list);
1041                 ret = -ENOSPC;
1042         }
1043         spin_unlock(&space_info->lock);
1044
1045         if (reclaim_bytes)
1046                 btrfs_space_info_add_old_bytes(fs_info, space_info,
1047                                                reclaim_bytes);
1048         ASSERT(list_empty(&ticket.list));
1049         return ret;
1050 }
1051
1052 /**
1053  * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1054  * @root - the root we're allocating for
1055  * @block_rsv - the block_rsv we're allocating for
1056  * @orig_bytes - the number of bytes we want
1057  * @flush - whether or not we can flush to make our reservation
1058  *
1059  * This will reserve orig_bytes number of bytes from the space info associated
1060  * with the block_rsv.  If there is not enough space it will make an attempt to
1061  * flush out space to make room.  It will do this by flushing delalloc if
1062  * possible or committing the transaction.  If flush is 0 then no attempts to
1063  * regain reservations will be made and this will fail if there is not enough
1064  * space already.
1065  */
1066 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1067                                  struct btrfs_block_rsv *block_rsv,
1068                                  u64 orig_bytes,
1069                                  enum btrfs_reserve_flush_enum flush)
1070 {
1071         struct btrfs_fs_info *fs_info = root->fs_info;
1072         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1073         int ret;
1074         bool system_chunk = (root == fs_info->chunk_root);
1075
1076         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1077                                        orig_bytes, flush, system_chunk);
1078         if (ret == -ENOSPC &&
1079             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1080                 if (block_rsv != global_rsv &&
1081                     !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1082                         ret = 0;
1083         }
1084         if (ret == -ENOSPC) {
1085                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1086                                               block_rsv->space_info->flags,
1087                                               orig_bytes, 1);
1088
1089                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1090                         btrfs_dump_space_info(fs_info, block_rsv->space_info,
1091                                               orig_bytes, 0);
1092         }
1093         return ret;
1094 }