1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
20 #include "volumes.h"
21 #include "space-info.h"
22 #include "delalloc-space.h"
23 #include "block-group.h"
24 #include "discard.h"
25
26 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
27 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
28 #define FORCE_EXTENT_THRESHOLD  SZ_1M
29
30 struct btrfs_trim_range {
31         u64 start;
32         u64 bytes;
33         struct list_head list;
34 };
35
36 static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
37                                 struct btrfs_free_space *bitmap_info);
38 static int link_free_space(struct btrfs_free_space_ctl *ctl,
39                            struct btrfs_free_space *info);
40 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
41                               struct btrfs_free_space *info);
42 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
43                              struct btrfs_trans_handle *trans,
44                              struct btrfs_io_ctl *io_ctl,
45                              struct btrfs_path *path);
46
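/*
 * Look up the inode that backs a free space cache.  The FREE_SPACE item at
 * (BTRFS_FREE_SPACE_OBJECTID, 0, @offset) in @root holds a
 * btrfs_free_space_header whose key points at the cache inode, which is
 * then loaded with btrfs_iget_path() under NOFS.
 */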
47 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
48                                                struct btrfs_path *path,
49                                                u64 offset)
50 {
51         struct btrfs_fs_info *fs_info = root->fs_info;
52         struct btrfs_key key;
53         struct btrfs_key location;
54         struct btrfs_disk_key disk_key;
55         struct btrfs_free_space_header *header;
56         struct extent_buffer *leaf;
57         struct inode *inode = NULL;
58         unsigned nofs_flag;
59         int ret;
60
61         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
62         key.offset = offset;
63         key.type = 0;
64
65         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
66         if (ret < 0)
67                 return ERR_PTR(ret);
68         if (ret > 0) {
69                 btrfs_release_path(path);
70                 return ERR_PTR(-ENOENT);
71         }
72
73         leaf = path->nodes[0];
74         header = btrfs_item_ptr(leaf, path->slots[0],
75                                 struct btrfs_free_space_header);
76         btrfs_free_space_key(leaf, header, &disk_key);
77         btrfs_disk_key_to_cpu(&location, &disk_key);
78         btrfs_release_path(path);
79
80         /*
81          * We are often under a trans handle at this point, so we need to make
82          * sure NOFS is set to keep us from deadlocking.
83          */
84         nofs_flag = memalloc_nofs_save();
85         inode = btrfs_iget_path(fs_info->sb, &location, root, path);
86         btrfs_release_path(path);
87         memalloc_nofs_restore(nofs_flag);
88         if (IS_ERR(inode))
89                 return inode;
90
91         mapping_set_gfp_mask(inode->i_mapping,
92                         mapping_gfp_constraint(inode->i_mapping,
93                         ~(__GFP_FS | __GFP_HIGHMEM)));
94
95         return inode;
96 }
97
98 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
99                 struct btrfs_path *path)
100 {
101         struct btrfs_fs_info *fs_info = block_group->fs_info;
102         struct inode *inode = NULL;
103         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
104
105         spin_lock(&block_group->lock);
106         if (block_group->inode)
107                 inode = igrab(block_group->inode);
108         spin_unlock(&block_group->lock);
109         if (inode)
110                 return inode;
111
112         inode = __lookup_free_space_inode(fs_info->tree_root, path,
113                                           block_group->start);
114         if (IS_ERR(inode))
115                 return inode;
116
117         spin_lock(&block_group->lock);
118         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
119                 btrfs_info(fs_info, "Old style space inode found, converting.");
120                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
121                         BTRFS_INODE_NODATACOW;
122                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
123         }
124
125         if (!block_group->iref) {
126                 block_group->inode = igrab(inode);
127                 block_group->iref = 1;
128         }
129         spin_unlock(&block_group->lock);
130
131         return inode;
132 }
133
134 static int __create_free_space_inode(struct btrfs_root *root,
135                                      struct btrfs_trans_handle *trans,
136                                      struct btrfs_path *path,
137                                      u64 ino, u64 offset)
138 {
139         struct btrfs_key key;
140         struct btrfs_disk_key disk_key;
141         struct btrfs_free_space_header *header;
142         struct btrfs_inode_item *inode_item;
143         struct extent_buffer *leaf;
144         u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
145         int ret;
146
147         ret = btrfs_insert_empty_inode(trans, root, path, ino);
148         if (ret)
149                 return ret;
150
151         /* We inline crcs for the free disk space cache */
152         if (ino != BTRFS_FREE_INO_OBJECTID)
153                 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
154
155         leaf = path->nodes[0];
156         inode_item = btrfs_item_ptr(leaf, path->slots[0],
157                                     struct btrfs_inode_item);
158         btrfs_item_key(leaf, &disk_key, path->slots[0]);
159         memzero_extent_buffer(leaf, (unsigned long)inode_item,
160                              sizeof(*inode_item));
161         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
162         btrfs_set_inode_size(leaf, inode_item, 0);
163         btrfs_set_inode_nbytes(leaf, inode_item, 0);
164         btrfs_set_inode_uid(leaf, inode_item, 0);
165         btrfs_set_inode_gid(leaf, inode_item, 0);
166         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
167         btrfs_set_inode_flags(leaf, inode_item, flags);
168         btrfs_set_inode_nlink(leaf, inode_item, 1);
169         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
170         btrfs_set_inode_block_group(leaf, inode_item, offset);
171         btrfs_mark_buffer_dirty(leaf);
172         btrfs_release_path(path);
173
174         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
175         key.offset = offset;
176         key.type = 0;
177         ret = btrfs_insert_empty_item(trans, root, path, &key,
178                                       sizeof(struct btrfs_free_space_header));
179         if (ret < 0) {
180                 btrfs_release_path(path);
181                 return ret;
182         }
183
184         leaf = path->nodes[0];
185         header = btrfs_item_ptr(leaf, path->slots[0],
186                                 struct btrfs_free_space_header);
187         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
188         btrfs_set_free_space_key(leaf, header, &disk_key);
189         btrfs_mark_buffer_dirty(leaf);
190         btrfs_release_path(path);
191
192         return 0;
193 }
194
195 int create_free_space_inode(struct btrfs_trans_handle *trans,
196                             struct btrfs_block_group *block_group,
197                             struct btrfs_path *path)
198 {
199         int ret;
200         u64 ino;
201
202         ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
203         if (ret < 0)
204                 return ret;
205
206         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
207                                          ino, block_group->start);
208 }
209
210 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
211                                        struct btrfs_block_rsv *rsv)
212 {
213         u64 needed_bytes;
214         int ret;
215
216         /* 1 for slack space, 1 for updating the inode */
217         needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
218                 btrfs_calc_metadata_size(fs_info, 1);
219
220         spin_lock(&rsv->lock);
221         if (rsv->reserved < needed_bytes)
222                 ret = -ENOSPC;
223         else
224                 ret = 0;
225         spin_unlock(&rsv->lock);
226         return ret;
227 }
228
229 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
230                                     struct btrfs_block_group *block_group,
231                                     struct inode *inode)
232 {
233         struct btrfs_root *root = BTRFS_I(inode)->root;
234         int ret = 0;
235         bool locked = false;
236
237         if (block_group) {
238                 struct btrfs_path *path = btrfs_alloc_path();
239
240                 if (!path) {
241                         ret = -ENOMEM;
242                         goto fail;
243                 }
244                 locked = true;
245                 mutex_lock(&trans->transaction->cache_write_mutex);
246                 if (!list_empty(&block_group->io_list)) {
247                         list_del_init(&block_group->io_list);
248
249                         btrfs_wait_cache_io(trans, block_group, path);
250                         btrfs_put_block_group(block_group);
251                 }
252
253                 /*
254                  * Now that we've truncated the cache away, it's no longer
255                  * set up or written.
256                  */
257                 spin_lock(&block_group->lock);
258                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
259                 spin_unlock(&block_group->lock);
260                 btrfs_free_path(path);
261         }
262
263         btrfs_i_size_write(BTRFS_I(inode), 0);
264         truncate_pagecache(inode, 0);
265
266         /*
267          * We skip the throttling logic for free space cache inodes, so we don't
268          * need to check for -EAGAIN.
269          */
270         ret = btrfs_truncate_inode_items(trans, root, inode,
271                                          0, BTRFS_EXTENT_DATA_KEY);
272         if (ret)
273                 goto fail;
274
275         ret = btrfs_update_inode(trans, root, inode);
276
277 fail:
278         if (locked)
279                 mutex_unlock(&trans->transaction->cache_write_mutex);
280         if (ret)
281                 btrfs_abort_transaction(trans, ret);
282
283         return ret;
284 }
285
286 static void readahead_cache(struct inode *inode)
287 {
288         struct file_ra_state *ra;
289         unsigned long last_index;
290
291         ra = kzalloc(sizeof(*ra), GFP_NOFS);
292         if (!ra)
293                 return;
294
295         file_ra_state_init(ra, inode->i_mapping);
296         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
297
298         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
299
300         kfree(ra);
301 }
302
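/*
 * The btrfs_io_ctl below streams the cache file page by page.  With
 * check_crcs set (the block group caches), the first page begins with one
 * u32 crc32c per page followed by a u64 generation; after that comes a
 * stream of struct btrfs_free_space_entry records, and each bitmap entry's
 * payload is a full page of bits stored after all of the entries.
 */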
303 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
304                        int write)
305 {
306         int num_pages;
307         int check_crcs = 0;
308
309         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
310
311         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
312                 check_crcs = 1;
313
314         /* Make sure we can fit our crcs and generation into the first page */
315         if (write && check_crcs &&
316             (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
317                 return -ENOSPC;
318
319         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
320
321         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
322         if (!io_ctl->pages)
323                 return -ENOMEM;
324
325         io_ctl->num_pages = num_pages;
326         io_ctl->fs_info = btrfs_sb(inode->i_sb);
327         io_ctl->check_crcs = check_crcs;
328         io_ctl->inode = inode;
329
330         return 0;
331 }
332 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
333
334 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
335 {
336         kfree(io_ctl->pages);
337         io_ctl->pages = NULL;
338 }
339
340 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
341 {
342         if (io_ctl->cur) {
343                 io_ctl->cur = NULL;
344                 io_ctl->orig = NULL;
345         }
346 }
347
348 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
349 {
350         ASSERT(io_ctl->index < io_ctl->num_pages);
351         io_ctl->page = io_ctl->pages[io_ctl->index++];
352         io_ctl->cur = page_address(io_ctl->page);
353         io_ctl->orig = io_ctl->cur;
354         io_ctl->size = PAGE_SIZE;
355         if (clear)
356                 clear_page(io_ctl->cur);
357 }
358
359 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
360 {
361         int i;
362
363         io_ctl_unmap_page(io_ctl);
364
365         for (i = 0; i < io_ctl->num_pages; i++) {
366                 if (io_ctl->pages[i]) {
367                         ClearPageChecked(io_ctl->pages[i]);
368                         unlock_page(io_ctl->pages[i]);
369                         put_page(io_ctl->pages[i]);
370                 }
371         }
372 }
373
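/*
 * Find or create, and lock, every page of the cache file.  When @uptodate is
 * set the pages are also read in and verified to still belong to the cache
 * inode's mapping before they are used.
 */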
374 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
375 {
376         struct page *page;
377         struct inode *inode = io_ctl->inode;
378         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
379         int i;
380
381         for (i = 0; i < io_ctl->num_pages; i++) {
382                 page = find_or_create_page(inode->i_mapping, i, mask);
383                 if (!page) {
384                         io_ctl_drop_pages(io_ctl);
385                         return -ENOMEM;
386                 }
387                 io_ctl->pages[i] = page;
388                 if (uptodate && !PageUptodate(page)) {
389                         btrfs_readpage(NULL, page);
390                         lock_page(page);
391                         if (page->mapping != inode->i_mapping) {
392                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
393                                           "free space cache page truncated");
394                                 io_ctl_drop_pages(io_ctl);
395                                 return -EIO;
396                         }
397                         if (!PageUptodate(page)) {
398                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
399                                            "error reading free space cache");
400                                 io_ctl_drop_pages(io_ctl);
401                                 return -EIO;
402                         }
403                 }
404         }
405
406         for (i = 0; i < io_ctl->num_pages; i++) {
407                 clear_page_dirty_for_io(io_ctl->pages[i]);
408                 set_page_extent_mapped(io_ctl->pages[i]);
409         }
410
411         return 0;
412 }
413
414 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
415 {
416         __le64 *val;
417
418         io_ctl_map_page(io_ctl, 1);
419
420         /*
421          * Skip the csum areas.  If we don't check crcs then we just have a
422          * 64bit chunk at the front of the first page.
423          */
424         if (io_ctl->check_crcs) {
425                 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
426                 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
427         } else {
428                 io_ctl->cur += sizeof(u64);
429                 io_ctl->size -= sizeof(u64) * 2;
430         }
431
432         val = io_ctl->cur;
433         *val = cpu_to_le64(generation);
434         io_ctl->cur += sizeof(u64);
435 }
436
437 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
438 {
439         __le64 *gen;
440
441         /*
442          * Skip the crc area.  If we don't check crcs then we just have a 64bit
443          * chunk at the front of the first page.
444          */
445         if (io_ctl->check_crcs) {
446                 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
447                 io_ctl->size -= sizeof(u64) +
448                         (sizeof(u32) * io_ctl->num_pages);
449         } else {
450                 io_ctl->cur += sizeof(u64);
451                 io_ctl->size -= sizeof(u64) * 2;
452         }
453
454         gen = io_ctl->cur;
455         if (le64_to_cpu(*gen) != generation) {
456                 btrfs_err_rl(io_ctl->fs_info,
457                         "space cache generation (%llu) does not match inode (%llu)",
458                                 *gen, generation);
459                 io_ctl_unmap_page(io_ctl);
460                 return -EIO;
461         }
462         io_ctl->cur += sizeof(u64);
463         return 0;
464 }
465
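/*
 * Checksum the page at @index that was just filled and store the crc32c in
 * the per-page crc array at the front of the first page.  For index 0 the
 * crc array itself is excluded from the checksummed range.  If crcs are not
 * being used the page is simply unmapped.
 */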
466 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
467 {
468         u32 *tmp;
469         u32 crc = ~(u32)0;
470         unsigned offset = 0;
471
472         if (!io_ctl->check_crcs) {
473                 io_ctl_unmap_page(io_ctl);
474                 return;
475         }
476
477         if (index == 0)
478                 offset = sizeof(u32) * io_ctl->num_pages;
479
480         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
481         btrfs_crc32c_final(crc, (u8 *)&crc);
482         io_ctl_unmap_page(io_ctl);
483         tmp = page_address(io_ctl->pages[0]);
484         tmp += index;
485         *tmp = crc;
486 }
487
488 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
489 {
490         u32 *tmp, val;
491         u32 crc = ~(u32)0;
492         unsigned offset = 0;
493
494         if (!io_ctl->check_crcs) {
495                 io_ctl_map_page(io_ctl, 0);
496                 return 0;
497         }
498
499         if (index == 0)
500                 offset = sizeof(u32) * io_ctl->num_pages;
501
502         tmp = page_address(io_ctl->pages[0]);
503         tmp += index;
504         val = *tmp;
505
506         io_ctl_map_page(io_ctl, 0);
507         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
508         btrfs_crc32c_final(crc, (u8 *)&crc);
509         if (val != crc) {
510                 btrfs_err_rl(io_ctl->fs_info,
511                         "csum mismatch on free space cache");
512                 io_ctl_unmap_page(io_ctl);
513                 return -EIO;
514         }
515
516         return 0;
517 }
518
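/*
 * Append one extent or bitmap entry to the stream.  When the current page
 * cannot fit another entry, its crc is finalized and the next page is
 * mapped; a bitmap entry's payload page is written later by
 * io_ctl_add_bitmap().
 */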
519 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
520                             void *bitmap)
521 {
522         struct btrfs_free_space_entry *entry;
523
524         if (!io_ctl->cur)
525                 return -ENOSPC;
526
527         entry = io_ctl->cur;
528         entry->offset = cpu_to_le64(offset);
529         entry->bytes = cpu_to_le64(bytes);
530         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
531                 BTRFS_FREE_SPACE_EXTENT;
532         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
533         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
534
535         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
536                 return 0;
537
538         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
539
540         /* No more pages to map */
541         if (io_ctl->index >= io_ctl->num_pages)
542                 return 0;
543
544         /* map the next page */
545         io_ctl_map_page(io_ctl, 1);
546         return 0;
547 }
548
549 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
550 {
551         if (!io_ctl->cur)
552                 return -ENOSPC;
553
554         /*
555          * If we aren't at the start of the current page, unmap this one and
556          * map the next one if there is one left.
557          */
558         if (io_ctl->cur != io_ctl->orig) {
559                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
560                 if (io_ctl->index >= io_ctl->num_pages)
561                         return -ENOSPC;
562                 io_ctl_map_page(io_ctl, 0);
563         }
564
565         copy_page(io_ctl->cur, bitmap);
566         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
567         if (io_ctl->index < io_ctl->num_pages)
568                 io_ctl_map_page(io_ctl, 0);
569         return 0;
570 }
571
572 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
573 {
574         /*
575          * If we're not on the boundary we know we've modified the page and we
576          * need to crc the page.
577          */
578         if (io_ctl->cur != io_ctl->orig)
579                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
580         else
581                 io_ctl_unmap_page(io_ctl);
582
583         while (io_ctl->index < io_ctl->num_pages) {
584                 io_ctl_map_page(io_ctl, 1);
585                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
586         }
587 }
588
589 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
590                             struct btrfs_free_space *entry, u8 *type)
591 {
592         struct btrfs_free_space_entry *e;
593         int ret;
594
595         if (!io_ctl->cur) {
596                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
597                 if (ret)
598                         return ret;
599         }
600
601         e = io_ctl->cur;
602         entry->offset = le64_to_cpu(e->offset);
603         entry->bytes = le64_to_cpu(e->bytes);
604         *type = e->type;
605         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
606         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
607
608         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
609                 return 0;
610
611         io_ctl_unmap_page(io_ctl);
612
613         return 0;
614 }
615
616 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
617                               struct btrfs_free_space *entry)
618 {
619         int ret;
620
621         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
622         if (ret)
623                 return ret;
624
625         copy_page(entry->bitmap, io_ctl->cur);
626         io_ctl_unmap_page(io_ctl);
627
628         return 0;
629 }
630
631 /*
632  * Since we attach pinned extents after the fact, we can have contiguous
633  * sections of free space that are split up into entries.  This poses a
634  * problem for tree logging: it could have allocated across what now appears
635  * to be two entries, because we would have merged those entries when adding
636  * the pinned extents back to the free space cache.  So run through the space
637  * cache that we just loaded and merge contiguous entries.  This keeps log
638  * replay from blowing up and makes for nicer allocator behavior.
639  */
640 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
641 {
642         struct btrfs_free_space *e, *prev = NULL;
643         struct rb_node *n;
644
645 again:
646         spin_lock(&ctl->tree_lock);
647         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
648                 e = rb_entry(n, struct btrfs_free_space, offset_index);
649                 if (!prev)
650                         goto next;
651                 if (e->bitmap || prev->bitmap)
652                         goto next;
653                 if (prev->offset + prev->bytes == e->offset) {
654                         unlink_free_space(ctl, prev);
655                         unlink_free_space(ctl, e);
656                         prev->bytes += e->bytes;
657                         kmem_cache_free(btrfs_free_space_cachep, e);
658                         link_free_space(ctl, prev);
659                         prev = NULL;
660                         spin_unlock(&ctl->tree_lock);
661                         goto again;
662                 }
663 next:
664                 prev = e;
665         }
666         spin_unlock(&ctl->tree_lock);
667 }
668
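/*
 * Read a free space cache file into @ctl.  Returns 1 if the cache was loaded
 * and linked into the rbtree, and 0 or a negative errno if it could not be
 * used; on a corrupt or mismatched cache the partially loaded entries are
 * removed again before returning.
 */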
669 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
670                                    struct btrfs_free_space_ctl *ctl,
671                                    struct btrfs_path *path, u64 offset)
672 {
673         struct btrfs_fs_info *fs_info = root->fs_info;
674         struct btrfs_free_space_header *header;
675         struct extent_buffer *leaf;
676         struct btrfs_io_ctl io_ctl;
677         struct btrfs_key key;
678         struct btrfs_free_space *e, *n;
679         LIST_HEAD(bitmaps);
680         u64 num_entries;
681         u64 num_bitmaps;
682         u64 generation;
683         u8 type;
684         int ret = 0;
685
686         /* Nothing in the space cache, goodbye */
687         if (!i_size_read(inode))
688                 return 0;
689
690         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
691         key.offset = offset;
692         key.type = 0;
693
694         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
695         if (ret < 0)
696                 return 0;
697         else if (ret > 0) {
698                 btrfs_release_path(path);
699                 return 0;
700         }
701
702         ret = -1;
703
704         leaf = path->nodes[0];
705         header = btrfs_item_ptr(leaf, path->slots[0],
706                                 struct btrfs_free_space_header);
707         num_entries = btrfs_free_space_entries(leaf, header);
708         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
709         generation = btrfs_free_space_generation(leaf, header);
710         btrfs_release_path(path);
711
712         if (!BTRFS_I(inode)->generation) {
713                 btrfs_info(fs_info,
714                            "the free space cache file (%llu) is invalid, skip it",
715                            offset);
716                 return 0;
717         }
718
719         if (BTRFS_I(inode)->generation != generation) {
720                 btrfs_err(fs_info,
721                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
722                           BTRFS_I(inode)->generation, generation);
723                 return 0;
724         }
725
726         if (!num_entries)
727                 return 0;
728
729         ret = io_ctl_init(&io_ctl, inode, 0);
730         if (ret)
731                 return ret;
732
733         readahead_cache(inode);
734
735         ret = io_ctl_prepare_pages(&io_ctl, true);
736         if (ret)
737                 goto out;
738
739         ret = io_ctl_check_crc(&io_ctl, 0);
740         if (ret)
741                 goto free_cache;
742
743         ret = io_ctl_check_generation(&io_ctl, generation);
744         if (ret)
745                 goto free_cache;
746
747         while (num_entries) {
748                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
749                                       GFP_NOFS);
750                 if (!e)
751                         goto free_cache;
752
753                 ret = io_ctl_read_entry(&io_ctl, e, &type);
754                 if (ret) {
755                         kmem_cache_free(btrfs_free_space_cachep, e);
756                         goto free_cache;
757                 }
758
759                 /*
760                  * Sync discard ensures that the free space cache is always
761                  * trimmed.  So when reading this in, the state should reflect
762                  * that.  We also do this for async as a stopgap for lack of
763                  * persistence.
764                  */
765                 if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
766                     btrfs_test_opt(fs_info, DISCARD_ASYNC))
767                         e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
768
769                 if (!e->bytes) {
770                         kmem_cache_free(btrfs_free_space_cachep, e);
771                         goto free_cache;
772                 }
773
774                 if (type == BTRFS_FREE_SPACE_EXTENT) {
775                         spin_lock(&ctl->tree_lock);
776                         ret = link_free_space(ctl, e);
777                         spin_unlock(&ctl->tree_lock);
778                         if (ret) {
779                                 btrfs_err(fs_info,
780                                         "Duplicate entries in free space cache, dumping");
781                                 kmem_cache_free(btrfs_free_space_cachep, e);
782                                 goto free_cache;
783                         }
784                 } else {
785                         ASSERT(num_bitmaps);
786                         num_bitmaps--;
787                         e->bitmap = kmem_cache_zalloc(
788                                         btrfs_free_space_bitmap_cachep, GFP_NOFS);
789                         if (!e->bitmap) {
790                                 kmem_cache_free(
791                                         btrfs_free_space_cachep, e);
792                                 goto free_cache;
793                         }
794                         spin_lock(&ctl->tree_lock);
795                         ret = link_free_space(ctl, e);
796                         ctl->total_bitmaps++;
797                         ctl->op->recalc_thresholds(ctl);
798                         spin_unlock(&ctl->tree_lock);
799                         if (ret) {
800                                 btrfs_err(fs_info,
801                                         "Duplicate entries in free space cache, dumping");
802                                 kmem_cache_free(btrfs_free_space_cachep, e);
803                                 goto free_cache;
804                         }
805                         list_add_tail(&e->list, &bitmaps);
806                 }
807
808                 num_entries--;
809         }
810
811         io_ctl_unmap_page(&io_ctl);
812
813         /*
814          * The bitmap pages are stored after all of the entries, in the same
815          * order in which the bitmap entries were added to the cache above.
816          */
817         list_for_each_entry_safe(e, n, &bitmaps, list) {
818                 list_del_init(&e->list);
819                 ret = io_ctl_read_bitmap(&io_ctl, e);
820                 if (ret)
821                         goto free_cache;
822                 e->bitmap_extents = count_bitmap_extents(ctl, e);
823                 if (!btrfs_free_space_trimmed(e)) {
824                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
825                                 e->bitmap_extents;
826                         ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
827                 }
828         }
829
830         io_ctl_drop_pages(&io_ctl);
831         merge_space_tree(ctl);
832         ret = 1;
833 out:
834         btrfs_discard_update_discardable(ctl->private, ctl);
835         io_ctl_free(&io_ctl);
836         return ret;
837 free_cache:
838         io_ctl_drop_pages(&io_ctl);
839         __btrfs_remove_free_space_cache(ctl);
840         goto out;
841 }
842
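/*
 * Load the on-disk free space cache for @block_group into its free space
 * ctl.  The loaded free space is cross-checked against the block group's
 * used and super bytes; on a mismatch or a read failure the cache is
 * dropped, marked BTRFS_DC_CLEAR and left to be rebuilt.
 */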
843 int load_free_space_cache(struct btrfs_block_group *block_group)
844 {
845         struct btrfs_fs_info *fs_info = block_group->fs_info;
846         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
847         struct inode *inode;
848         struct btrfs_path *path;
849         int ret = 0;
850         bool matched;
851         u64 used = block_group->used;
852
853         /*
854          * If this block group has been marked to be cleared for one reason or
855          * another then we can't trust the on disk cache, so just return.
856          */
857         spin_lock(&block_group->lock);
858         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
859                 spin_unlock(&block_group->lock);
860                 return 0;
861         }
862         spin_unlock(&block_group->lock);
863
864         path = btrfs_alloc_path();
865         if (!path)
866                 return 0;
867         path->search_commit_root = 1;
868         path->skip_locking = 1;
869
870         /*
871          * We must pass a path with search_commit_root set to btrfs_iget in
872          * order to avoid a deadlock when allocating extents for the tree root.
873          *
874          * When we are COWing an extent buffer from the tree root, when looking
875          * for a free extent, at extent-tree.c:find_free_extent(), we can find a
876          * block group without its free space cache loaded. When we find one
877          * we must load its space cache which requires reading its free space
878          * cache's inode item from the root tree. If this inode item is located
879          * in the same leaf that we started COWing before, then we end up in
880          * deadlock on the extent buffer (trying to read lock it when we
881          * previously write locked it).
882          *
883          * It's safe to read the inode item using the commit root because
884          * block groups, once loaded, stay in memory forever (until they are
885          * removed) as well as their space caches once loaded. New block groups
886          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
887          * we will never try to read their inode item while the fs is mounted.
888          */
889         inode = lookup_free_space_inode(block_group, path);
890         if (IS_ERR(inode)) {
891                 btrfs_free_path(path);
892                 return 0;
893         }
894
895         /* We may have converted the inode and made the cache invalid. */
896         spin_lock(&block_group->lock);
897         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
898                 spin_unlock(&block_group->lock);
899                 btrfs_free_path(path);
900                 goto out;
901         }
902         spin_unlock(&block_group->lock);
903
904         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
905                                       path, block_group->start);
906         btrfs_free_path(path);
907         if (ret <= 0)
908                 goto out;
909
910         spin_lock(&ctl->tree_lock);
911         matched = (ctl->free_space == (block_group->length - used -
912                                        block_group->bytes_super));
913         spin_unlock(&ctl->tree_lock);
914
915         if (!matched) {
916                 __btrfs_remove_free_space_cache(ctl);
917                 btrfs_warn(fs_info,
918                            "block group %llu has wrong amount of free space",
919                            block_group->start);
920                 ret = -1;
921         }
922 out:
923         if (ret < 0) {
924                 /* This cache is bogus, make sure it gets cleared */
925                 spin_lock(&block_group->lock);
926                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
927                 spin_unlock(&block_group->lock);
928                 ret = 0;
929
930                 btrfs_warn(fs_info,
931                            "failed to load free space cache for block group %llu, rebuilding it now",
932                            block_group->start);
933         }
934
935         iput(inode);
936         return ret;
937 }
938
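/*
 * Write out every free space range tracked for this cache: the entries in
 * the ctl's rbtree, the entries of the block group's allocation cluster (if
 * one exists), and the ranges temporarily removed from the tree while they
 * are being trimmed.  Bitmap entries are collected on @bitmap_list so their
 * payload pages can be written after all of the entries.
 */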
939 static noinline_for_stack
940 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
941                               struct btrfs_free_space_ctl *ctl,
942                               struct btrfs_block_group *block_group,
943                               int *entries, int *bitmaps,
944                               struct list_head *bitmap_list)
945 {
946         int ret;
947         struct btrfs_free_cluster *cluster = NULL;
948         struct btrfs_free_cluster *cluster_locked = NULL;
949         struct rb_node *node = rb_first(&ctl->free_space_offset);
950         struct btrfs_trim_range *trim_entry;
951
952         /* Get the cluster for this block_group if it exists */
953         if (block_group && !list_empty(&block_group->cluster_list)) {
954                 cluster = list_entry(block_group->cluster_list.next,
955                                      struct btrfs_free_cluster,
956                                      block_group_list);
957         }
958
959         if (!node && cluster) {
960                 cluster_locked = cluster;
961                 spin_lock(&cluster_locked->lock);
962                 node = rb_first(&cluster->root);
963                 cluster = NULL;
964         }
965
966         /* Write out the extent entries */
967         while (node) {
968                 struct btrfs_free_space *e;
969
970                 e = rb_entry(node, struct btrfs_free_space, offset_index);
971                 *entries += 1;
972
973                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
974                                        e->bitmap);
975                 if (ret)
976                         goto fail;
977
978                 if (e->bitmap) {
979                         list_add_tail(&e->list, bitmap_list);
980                         *bitmaps += 1;
981                 }
982                 node = rb_next(node);
983                 if (!node && cluster) {
984                         node = rb_first(&cluster->root);
985                         cluster_locked = cluster;
986                         spin_lock(&cluster_locked->lock);
987                         cluster = NULL;
988                 }
989         }
990         if (cluster_locked) {
991                 spin_unlock(&cluster_locked->lock);
992                 cluster_locked = NULL;
993         }
994
995         /*
996          * Make sure we don't miss any range that was removed from our rbtree
997          * because trimming is running. Otherwise after a umount+mount (or crash
998          * after committing the transaction) we would leak free space and get
999          * an inconsistent free space cache report from fsck.
1000          */
1001         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1002                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1003                                        trim_entry->bytes, NULL);
1004                 if (ret)
1005                         goto fail;
1006                 *entries += 1;
1007         }
1008
1009         return 0;
1010 fail:
1011         if (cluster_locked)
1012                 spin_unlock(&cluster_locked->lock);
1013         return -ENOSPC;
1014 }
1015
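/*
 * After the cache file contents have been written, update the free space
 * header item with the final entry and bitmap counts and stamp both the
 * header and the cache inode with the current transaction's generation.
 */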
1016 static noinline_for_stack int
1017 update_cache_item(struct btrfs_trans_handle *trans,
1018                   struct btrfs_root *root,
1019                   struct inode *inode,
1020                   struct btrfs_path *path, u64 offset,
1021                   int entries, int bitmaps)
1022 {
1023         struct btrfs_key key;
1024         struct btrfs_free_space_header *header;
1025         struct extent_buffer *leaf;
1026         int ret;
1027
1028         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1029         key.offset = offset;
1030         key.type = 0;
1031
1032         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1033         if (ret < 0) {
1034                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1035                                  EXTENT_DELALLOC, 0, 0, NULL);
1036                 goto fail;
1037         }
1038         leaf = path->nodes[0];
1039         if (ret > 0) {
1040                 struct btrfs_key found_key;
1041                 ASSERT(path->slots[0]);
1042                 path->slots[0]--;
1043                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1044                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1045                     found_key.offset != offset) {
1046                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1047                                          inode->i_size - 1, EXTENT_DELALLOC, 0,
1048                                          0, NULL);
1049                         btrfs_release_path(path);
1050                         goto fail;
1051                 }
1052         }
1053
1054         BTRFS_I(inode)->generation = trans->transid;
1055         header = btrfs_item_ptr(leaf, path->slots[0],
1056                                 struct btrfs_free_space_header);
1057         btrfs_set_free_space_entries(leaf, header, entries);
1058         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1059         btrfs_set_free_space_generation(leaf, header, trans->transid);
1060         btrfs_mark_buffer_dirty(leaf);
1061         btrfs_release_path(path);
1062
1063         return 0;
1064
1065 fail:
1066         return -1;
1067 }
1068
1069 static noinline_for_stack int write_pinned_extent_entries(
1070                             struct btrfs_trans_handle *trans,
1071                             struct btrfs_block_group *block_group,
1072                             struct btrfs_io_ctl *io_ctl,
1073                             int *entries)
1074 {
1075         u64 start, extent_start, extent_end, len;
1076         struct extent_io_tree *unpin = NULL;
1077         int ret;
1078
1079         if (!block_group)
1080                 return 0;
1081
1082         /*
1083          * We want to add any pinned extents to our free space cache
1084          * so we don't leak the space
1085          *
1086          * We shouldn't have switched the pinned extents yet so this is the
1087          * right one
1088          */
1089         unpin = &trans->transaction->pinned_extents;
1090
1091         start = block_group->start;
1092
1093         while (start < block_group->start + block_group->length) {
1094                 ret = find_first_extent_bit(unpin, start,
1095                                             &extent_start, &extent_end,
1096                                             EXTENT_DIRTY, NULL);
1097                 if (ret)
1098                         return 0;
1099
1100                 /* This pinned extent is out of our range */
1101                 if (extent_start >= block_group->start + block_group->length)
1102                         return 0;
1103
1104                 extent_start = max(extent_start, start);
1105                 extent_end = min(block_group->start + block_group->length,
1106                                  extent_end + 1);
1107                 len = extent_end - extent_start;
1108
1109                 *entries += 1;
1110                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1111                 if (ret)
1112                         return -ENOSPC;
1113
1114                 start = extent_end;
1115         }
1116
1117         return 0;
1118 }
1119
1120 static noinline_for_stack int
1121 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1122 {
1123         struct btrfs_free_space *entry, *next;
1124         int ret;
1125
1126         /* Write out the bitmaps */
1127         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1128                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1129                 if (ret)
1130                         return -ENOSPC;
1131                 list_del_init(&entry->list);
1132         }
1133
1134         return 0;
1135 }
1136
1137 static int flush_dirty_cache(struct inode *inode)
1138 {
1139         int ret;
1140
1141         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1142         if (ret)
1143                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1144                                  EXTENT_DELALLOC, 0, 0, NULL);
1145
1146         return ret;
1147 }
1148
1149 static void noinline_for_stack
1150 cleanup_bitmap_list(struct list_head *bitmap_list)
1151 {
1152         struct btrfs_free_space *entry, *next;
1153
1154         list_for_each_entry_safe(entry, next, bitmap_list, list)
1155                 list_del_init(&entry->list);
1156 }
1157
1158 static void noinline_for_stack
1159 cleanup_write_cache_enospc(struct inode *inode,
1160                            struct btrfs_io_ctl *io_ctl,
1161                            struct extent_state **cached_state)
1162 {
1163         io_ctl_drop_pages(io_ctl);
1164         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1165                              i_size_read(inode) - 1, cached_state);
1166 }
1167
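/*
 * Wait for a previously started cache write-out: flush and wait on the
 * ordered IO, then update the free space header item so the cache is
 * considered valid.  On failure the cached pages are invalidated, the
 * inode's generation is cleared and the block group is marked
 * BTRFS_DC_ERROR; BTRFS_DC_WRITTEN is only set if the block group was not
 * re-dirtied while the IO was in flight.
 */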
1168 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1169                                  struct btrfs_trans_handle *trans,
1170                                  struct btrfs_block_group *block_group,
1171                                  struct btrfs_io_ctl *io_ctl,
1172                                  struct btrfs_path *path, u64 offset)
1173 {
1174         int ret;
1175         struct inode *inode = io_ctl->inode;
1176
1177         if (!inode)
1178                 return 0;
1179
1180         /* Flush the dirty pages in the cache file. */
1181         ret = flush_dirty_cache(inode);
1182         if (ret)
1183                 goto out;
1184
1185         /* Update the cache item to tell everyone this cache file is valid. */
1186         ret = update_cache_item(trans, root, inode, path, offset,
1187                                 io_ctl->entries, io_ctl->bitmaps);
1188 out:
1189         io_ctl_free(io_ctl);
1190         if (ret) {
1191                 invalidate_inode_pages2(inode->i_mapping);
1192                 BTRFS_I(inode)->generation = 0;
1193                 if (block_group) {
1194 #ifdef CONFIG_BTRFS_DEBUG
1195                         btrfs_err(root->fs_info,
1196                                   "failed to write free space cache for block group %llu",
1197                                   block_group->start);
1198 #endif
1199                 }
1200         }
1201         btrfs_update_inode(trans, root, inode);
1202
1203         if (block_group) {
1204                 /* the dirty list is protected by the dirty_bgs_lock */
1205                 spin_lock(&trans->transaction->dirty_bgs_lock);
1206
1207                 /* the disk_cache_state is protected by the block group lock */
1208                 spin_lock(&block_group->lock);
1209
1210                 /*
1211                  * only mark this as written if we didn't get put back on
1212                  * the dirty list while waiting for IO.   Otherwise our
1213                  * cache state won't be right, and we won't get written again
1214                  */
1215                 if (!ret && list_empty(&block_group->dirty_list))
1216                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1217                 else if (ret)
1218                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1219
1220                 spin_unlock(&block_group->lock);
1221                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1222                 io_ctl->inode = NULL;
1223                 iput(inode);
1224         }
1225
1226         return ret;
1227
1228 }
1229
1230 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1231                                     struct btrfs_trans_handle *trans,
1232                                     struct btrfs_io_ctl *io_ctl,
1233                                     struct btrfs_path *path)
1234 {
1235         return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1236 }
1237
1238 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1239                         struct btrfs_block_group *block_group,
1240                         struct btrfs_path *path)
1241 {
1242         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1243                                      block_group, &block_group->io_ctl,
1244                                      path, block_group->start);
1245 }
1246
1247 /**
1248  * __btrfs_write_out_cache - write out cached info to an inode
1249  * @root - the root the inode belongs to
1250  * @ctl - the free space cache we are going to write out
1251  * @block_group - the block_group for this cache if it belongs to a block_group
1252  * @trans - the trans handle
1253  *
1254  * This function writes out a free space cache struct to disk for quick recovery
1255  * on mount.  This will return 0 if it was successful in writing the cache out,
1256  * or an errno if it was not.
1257  */
1258 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1259                                    struct btrfs_free_space_ctl *ctl,
1260                                    struct btrfs_block_group *block_group,
1261                                    struct btrfs_io_ctl *io_ctl,
1262                                    struct btrfs_trans_handle *trans)
1263 {
1264         struct extent_state *cached_state = NULL;
1265         LIST_HEAD(bitmap_list);
1266         int entries = 0;
1267         int bitmaps = 0;
1268         int ret;
1269         int must_iput = 0;
1270
1271         if (!i_size_read(inode))
1272                 return -EIO;
1273
1274         WARN_ON(io_ctl->pages);
1275         ret = io_ctl_init(io_ctl, inode, 1);
1276         if (ret)
1277                 return ret;
1278
1279         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1280                 down_write(&block_group->data_rwsem);
1281                 spin_lock(&block_group->lock);
1282                 if (block_group->delalloc_bytes) {
1283                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1284                         spin_unlock(&block_group->lock);
1285                         up_write(&block_group->data_rwsem);
1286                         BTRFS_I(inode)->generation = 0;
1287                         ret = 0;
1288                         must_iput = 1;
1289                         goto out;
1290                 }
1291                 spin_unlock(&block_group->lock);
1292         }
1293
1294         /* Lock all pages first so we can lock the extent safely. */
1295         ret = io_ctl_prepare_pages(io_ctl, false);
1296         if (ret)
1297                 goto out_unlock;
1298
1299         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1300                          &cached_state);
1301
1302         io_ctl_set_generation(io_ctl, trans->transid);
1303
1304         mutex_lock(&ctl->cache_writeout_mutex);
1305         /* Write out the extent entries in the free space cache */
1306         spin_lock(&ctl->tree_lock);
1307         ret = write_cache_extent_entries(io_ctl, ctl,
1308                                          block_group, &entries, &bitmaps,
1309                                          &bitmap_list);
1310         if (ret)
1311                 goto out_nospc_locked;
1312
1313         /*
1314          * Some space that is freed in the current transaction is pinned; it
1315          * will be added to the free space cache after the transaction is
1316          * committed, so we must not lose it.
1317          *
1318          * If this changes while we are working, we'll get added back to
1319          * the dirty list and redo it.  No locking is needed.
1320          */
1321         ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1322         if (ret)
1323                 goto out_nospc_locked;
1324
1325         /*
1326          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1327          * locked while doing it because a concurrent trim can be manipulating
1328          * or freeing the bitmap.
1329          */
1330         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1331         spin_unlock(&ctl->tree_lock);
1332         mutex_unlock(&ctl->cache_writeout_mutex);
1333         if (ret)
1334                 goto out_nospc;
1335
1336         /* Zero out the rest of the pages just to make sure */
1337         io_ctl_zero_remaining_pages(io_ctl);
1338
1339         /* Everything is written out, now we dirty the pages in the file. */
1340         ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1341                                 i_size_read(inode), &cached_state);
1342         if (ret)
1343                 goto out_nospc;
1344
1345         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1346                 up_write(&block_group->data_rwsem);
1347         /*
1348          * Release the pages and unlock the extent; we will flush
1349          * them out later.
1350          */
1351         io_ctl_drop_pages(io_ctl);
1352
1353         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1354                              i_size_read(inode) - 1, &cached_state);
1355
1356         /*
1357          * At this point the pages are under IO and we're happy.
1358          * The caller is responsible for waiting on them and updating
1359          * the cache and the inode.
1360          */
1361         io_ctl->entries = entries;
1362         io_ctl->bitmaps = bitmaps;
1363
1364         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1365         if (ret)
1366                 goto out;
1367
1368         return 0;
1369
1370 out_nospc_locked:
1371         cleanup_bitmap_list(&bitmap_list);
1372         spin_unlock(&ctl->tree_lock);
1373         mutex_unlock(&ctl->cache_writeout_mutex);
1374
1375 out_nospc:
1376         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1377
1378 out_unlock:
1379         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1380                 up_write(&block_group->data_rwsem);
1381
1382 out:
1383         io_ctl->inode = NULL;
1384         io_ctl_free(io_ctl);
1385         if (ret) {
1386                 invalidate_inode_pages2(inode->i_mapping);
1387                 BTRFS_I(inode)->generation = 0;
1388         }
1389         btrfs_update_inode(trans, root, inode);
1390         if (must_iput)
1391                 iput(inode);
1392         return ret;
1393 }
1394
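/*
 * Start write-out of the free space cache for @block_group if its cache
 * state asks for it.  On success the IO is left in flight and the caller is
 * expected to call btrfs_wait_cache_io() later to wait for it and put the
 * inode; on failure the block group is marked BTRFS_DC_ERROR.
 */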
1395 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1396                           struct btrfs_block_group *block_group,
1397                           struct btrfs_path *path)
1398 {
1399         struct btrfs_fs_info *fs_info = trans->fs_info;
1400         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1401         struct inode *inode;
1402         int ret = 0;
1403
1404         spin_lock(&block_group->lock);
1405         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1406                 spin_unlock(&block_group->lock);
1407                 return 0;
1408         }
1409         spin_unlock(&block_group->lock);
1410
1411         inode = lookup_free_space_inode(block_group, path);
1412         if (IS_ERR(inode))
1413                 return 0;
1414
1415         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1416                                 block_group, &block_group->io_ctl, trans);
1417         if (ret) {
1418 #ifdef CONFIG_BTRFS_DEBUG
1419                 btrfs_err(fs_info,
1420                           "failed to write free space cache for block group %llu",
1421                           block_group->start);
1422 #endif
1423                 spin_lock(&block_group->lock);
1424                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1425                 spin_unlock(&block_group->lock);
1426
1427                 block_group->io_ctl.inode = NULL;
1428                 iput(inode);
1429         }
1430
1431         /*
1432          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1433          * to wait for IO and put the inode
1434          */
1435
1436         return ret;
1437 }
1438
1439 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1440                                           u64 offset)
1441 {
1442         ASSERT(offset >= bitmap_start);
1443         offset -= bitmap_start;
1444         return (unsigned long)(div_u64(offset, unit));
1445 }
1446
1447 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1448 {
1449         return (unsigned long)(div_u64(bytes, unit));
1450 }
1451
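/*
 * Round @offset down to the start offset of the bitmap that would cover it.
 * Each bitmap covers BITS_PER_BITMAP * ctl->unit bytes of the block group,
 * aligned to ctl->start.
 */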
1452 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1453                                    u64 offset)
1454 {
1455         u64 bitmap_start;
1456         u64 bytes_per_bitmap;
1457
1458         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1459         bitmap_start = offset - ctl->start;
1460         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1461         bitmap_start *= bytes_per_bitmap;
1462         bitmap_start += ctl->start;
1463
1464         return bitmap_start;
1465 }
1466
1467 static int tree_insert_offset(struct rb_root *root, u64 offset,
1468                               struct rb_node *node, int bitmap)
1469 {
1470         struct rb_node **p = &root->rb_node;
1471         struct rb_node *parent = NULL;
1472         struct btrfs_free_space *info;
1473
1474         while (*p) {
1475                 parent = *p;
1476                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1477
1478                 if (offset < info->offset) {
1479                         p = &(*p)->rb_left;
1480                 } else if (offset > info->offset) {
1481                         p = &(*p)->rb_right;
1482                 } else {
1483                         /*
1484                          * we could have a bitmap entry and an extent entry
1485                          * share the same offset.  If this is the case, we want
1486                          * the extent entry to always be found first if we do a
1487                          * linear search through the tree, since we want to have
1488                          * the quickest allocation time, and allocating from an
1489                          * extent is faster than allocating from a bitmap.  So
1490                          * if we're inserting a bitmap and we find an entry at
1491                          * this offset, we want to go right, or after this entry
1492                          * logically.  If we are inserting an extent and we've
1493                          * found a bitmap, we want to go left, or before
1494                          * logically.
1495                          */
1496                         if (bitmap) {
1497                                 if (info->bitmap) {
1498                                         WARN_ON_ONCE(1);
1499                                         return -EEXIST;
1500                                 }
1501                                 p = &(*p)->rb_right;
1502                         } else {
1503                                 if (!info->bitmap) {
1504                                         WARN_ON_ONCE(1);
1505                                         return -EEXIST;
1506                                 }
1507                                 p = &(*p)->rb_left;
1508                         }
1509                 }
1510         }
1511
1512         rb_link_node(node, parent, p);
1513         rb_insert_color(node, root);
1514
1515         return 0;
1516 }
1517
1518 /*
1519  * searches the tree for the given offset.
1520  *
1521  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1522  * want a section that has at least bytes size and comes at or after the given
1523  * offset.
1524  */
1525 static struct btrfs_free_space *
1526 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1527                    u64 offset, int bitmap_only, int fuzzy)
1528 {
1529         struct rb_node *n = ctl->free_space_offset.rb_node;
1530         struct btrfs_free_space *entry, *prev = NULL;
1531
1532         /* find entry that is closest to the 'offset' */
1533         while (1) {
1534                 if (!n) {
1535                         entry = NULL;
1536                         break;
1537                 }
1538
1539                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1540                 prev = entry;
1541
1542                 if (offset < entry->offset)
1543                         n = n->rb_left;
1544                 else if (offset > entry->offset)
1545                         n = n->rb_right;
1546                 else
1547                         break;
1548         }
1549
1550         if (bitmap_only) {
1551                 if (!entry)
1552                         return NULL;
1553                 if (entry->bitmap)
1554                         return entry;
1555
1556                 /*
1557                  * bitmap entry and extent entry may share same offset,
1558                  * in that case, bitmap entry comes after extent entry.
1559                  */
1560                 n = rb_next(n);
1561                 if (!n)
1562                         return NULL;
1563                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1564                 if (entry->offset != offset)
1565                         return NULL;
1566
1567                 WARN_ON(!entry->bitmap);
1568                 return entry;
1569         } else if (entry) {
1570                 if (entry->bitmap) {
1571                         /*
1572                          * if previous extent entry covers the offset,
1573                          * we should return it instead of the bitmap entry
1574                          */
1575                         n = rb_prev(&entry->offset_index);
1576                         if (n) {
1577                                 prev = rb_entry(n, struct btrfs_free_space,
1578                                                 offset_index);
1579                                 if (!prev->bitmap &&
1580                                     prev->offset + prev->bytes > offset)
1581                                         entry = prev;
1582                         }
1583                 }
1584                 return entry;
1585         }
1586
1587         if (!prev)
1588                 return NULL;
1589
1590         /* find last entry before the 'offset' */
1591         entry = prev;
1592         if (entry->offset > offset) {
1593                 n = rb_prev(&entry->offset_index);
1594                 if (n) {
1595                         entry = rb_entry(n, struct btrfs_free_space,
1596                                         offset_index);
1597                         ASSERT(entry->offset <= offset);
1598                 } else {
1599                         if (fuzzy)
1600                                 return entry;
1601                         else
1602                                 return NULL;
1603                 }
1604         }
1605
1606         if (entry->bitmap) {
1607                 n = rb_prev(&entry->offset_index);
1608                 if (n) {
1609                         prev = rb_entry(n, struct btrfs_free_space,
1610                                         offset_index);
1611                         if (!prev->bitmap &&
1612                             prev->offset + prev->bytes > offset)
1613                                 return prev;
1614                 }
1615                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1616                         return entry;
1617         } else if (entry->offset + entry->bytes > offset)
1618                 return entry;
1619
1620         if (!fuzzy)
1621                 return NULL;
1622
1623         while (1) {
1624                 if (entry->bitmap) {
1625                         if (entry->offset + BITS_PER_BITMAP *
1626                             ctl->unit > offset)
1627                                 break;
1628                 } else {
1629                         if (entry->offset + entry->bytes > offset)
1630                                 break;
1631                 }
1632
1633                 n = rb_next(&entry->offset_index);
1634                 if (!n)
1635                         return NULL;
1636                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1637         }
1638         return entry;
1639 }
1640
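/*
 * __unlink_free_space() removes the entry from the rb_tree and updates the
 * extent and discardable counters but leaves ctl->free_space untouched;
 * unlink_free_space() additionally subtracts the entry's bytes.  The bare
 * variant is for callers that are only shuffling already accounted space
 * around and therefore must not change the running total.
 */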
1641 static inline void
1642 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1643                     struct btrfs_free_space *info)
1644 {
1645         rb_erase(&info->offset_index, &ctl->free_space_offset);
1646         ctl->free_extents--;
1647
1648         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1649                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
1650                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1651         }
1652 }
1653
1654 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1655                               struct btrfs_free_space *info)
1656 {
1657         __unlink_free_space(ctl, info);
1658         ctl->free_space -= info->bytes;
1659 }
1660
1661 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1662                            struct btrfs_free_space *info)
1663 {
1664         int ret = 0;
1665
1666         ASSERT(info->bytes || info->bitmap);
1667         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1668                                  &info->offset_index, (info->bitmap != NULL));
1669         if (ret)
1670                 return ret;
1671
1672         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1673                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
1674                 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1675         }
1676
1677         ctl->free_space += info->bytes;
1678         ctl->free_extents++;
1679         return ret;
1680 }
1681
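/*
 * Recompute the point at which we stop adding plain extent entries and start
 * packing free space into bitmaps.  The budget scales with the block group
 * size; as a worked example, a 1GiB block group gets max_bytes = 64KiB, and
 * with no bitmaps yet the extent budget is capped at 32KiB, i.e.
 * 32KiB / sizeof(struct btrfs_free_space) entries (a few hundred on a typical
 * 64-bit kernel) before use_bitmap() starts preferring bitmaps.
 */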
1682 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1683 {
1684         struct btrfs_block_group *block_group = ctl->private;
1685         u64 max_bytes;
1686         u64 bitmap_bytes;
1687         u64 extent_bytes;
1688         u64 size = block_group->length;
1689         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1690         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1691
1692         max_bitmaps = max_t(u64, max_bitmaps, 1);
1693
1694         ASSERT(ctl->total_bitmaps <= max_bitmaps);
1695
1696         /*
1697          * We are trying to keep the total amount of memory used per 1GiB of
1698          * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
1699          * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
1700          * bitmaps, we may end up using more memory than this.
1701          */
1702         if (size < SZ_1G)
1703                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1704         else
1705                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1706
1707         bitmap_bytes = ctl->total_bitmaps * ctl->unit;
1708
1709         /*
1710          * We want the byte budget for extent entries to be whatever is left
1711          * over after the bitmaps, capped at half of the max bytes.
1712          */
1713         extent_bytes = max_bytes - bitmap_bytes;
1714         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1715
1716         ctl->extents_thresh =
1717                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1718 }
1719
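/*
 * Clear a range of bits and keep the per-bitmap extent count in sync.  The
 * delta starts at -1 (assume an entire free region disappears) and is bumped
 * back up for each side of the cleared range that still borders a set bit:
 * trimming one end of a region leaves the count unchanged, while punching a
 * hole in the middle of a region splits it in two (net +1).
 */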
1720 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1721                                        struct btrfs_free_space *info,
1722                                        u64 offset, u64 bytes)
1723 {
1724         unsigned long start, count, end;
1725         int extent_delta = -1;
1726
1727         start = offset_to_bit(info->offset, ctl->unit, offset);
1728         count = bytes_to_bits(bytes, ctl->unit);
1729         end = start + count;
1730         ASSERT(end <= BITS_PER_BITMAP);
1731
1732         bitmap_clear(info->bitmap, start, count);
1733
1734         info->bytes -= bytes;
1735         if (info->max_extent_size > ctl->unit)
1736                 info->max_extent_size = 0;
1737
1738         if (start && test_bit(start - 1, info->bitmap))
1739                 extent_delta++;
1740
1741         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1742                 extent_delta++;
1743
1744         info->bitmap_extents += extent_delta;
1745         if (!btrfs_free_space_trimmed(info)) {
1746                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1747                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1748         }
1749 }
1750
1751 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1752                               struct btrfs_free_space *info, u64 offset,
1753                               u64 bytes)
1754 {
1755         __bitmap_clear_bits(ctl, info, offset, bytes);
1756         ctl->free_space -= bytes;
1757 }
1758
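/*
 * Mirror of __bitmap_clear_bits(): a newly set run may be a brand new free
 * region (+1) or may merge with a neighbor on either side, so the delta
 * starts at +1 and drops for each adjacent bit that was already set.
 */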
1759 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1760                             struct btrfs_free_space *info, u64 offset,
1761                             u64 bytes)
1762 {
1763         unsigned long start, count, end;
1764         int extent_delta = 1;
1765
1766         start = offset_to_bit(info->offset, ctl->unit, offset);
1767         count = bytes_to_bits(bytes, ctl->unit);
1768         end = start + count;
1769         ASSERT(end <= BITS_PER_BITMAP);
1770
1771         bitmap_set(info->bitmap, start, count);
1772
1773         info->bytes += bytes;
1774         ctl->free_space += bytes;
1775
1776         if (start && test_bit(start - 1, info->bitmap))
1777                 extent_delta--;
1778
1779         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1780                 extent_delta--;
1781
1782         info->bitmap_extents += extent_delta;
1783         if (!btrfs_free_space_trimmed(info)) {
1784                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1785                 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1786         }
1787 }
1788
1789 /*
1790  * Search for a free run of at least *bytes at or after *offset.  If no
1791  * suitable extent is found, return -1 and record the largest free run in *bytes.
1792  */
1793 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1794                          struct btrfs_free_space *bitmap_info, u64 *offset,
1795                          u64 *bytes, bool for_alloc)
1796 {
1797         unsigned long found_bits = 0;
1798         unsigned long max_bits = 0;
1799         unsigned long bits, i;
1800         unsigned long next_zero;
1801         unsigned long extent_bits;
1802
1803         /*
1804          * Skip searching the bitmap if we don't have a contiguous section that
1805          * is large enough for this allocation.
1806          */
1807         if (for_alloc &&
1808             bitmap_info->max_extent_size &&
1809             bitmap_info->max_extent_size < *bytes) {
1810                 *bytes = bitmap_info->max_extent_size;
1811                 return -1;
1812         }
1813
1814         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1815                           max_t(u64, *offset, bitmap_info->offset));
1816         bits = bytes_to_bits(*bytes, ctl->unit);
1817
1818         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1819                 if (for_alloc && bits == 1) {
1820                         found_bits = 1;
1821                         break;
1822                 }
1823                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1824                                                BITS_PER_BITMAP, i);
1825                 extent_bits = next_zero - i;
1826                 if (extent_bits >= bits) {
1827                         found_bits = extent_bits;
1828                         break;
1829                 } else if (extent_bits > max_bits) {
1830                         max_bits = extent_bits;
1831                 }
1832                 i = next_zero;
1833         }
1834
1835         if (found_bits) {
1836                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1837                 *bytes = (u64)(found_bits) * ctl->unit;
1838                 return 0;
1839         }
1840
1841         *bytes = (u64)(max_bits) * ctl->unit;
1842         bitmap_info->max_extent_size = *bytes;
1843         return -1;
1844 }
1845
1846 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1847 {
1848         if (entry->bitmap)
1849                 return entry->max_extent_size;
1850         return entry->bytes;
1851 }
1852
1853 /* Cache the size of the max extent in bytes */
1854 static struct btrfs_free_space *
1855 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1856                 unsigned long align, u64 *max_extent_size)
1857 {
1858         struct btrfs_free_space *entry;
1859         struct rb_node *node;
1860         u64 tmp;
1861         u64 align_off;
1862         int ret;
1863
1864         if (!ctl->free_space_offset.rb_node)
1865                 goto out;
1866
1867         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1868         if (!entry)
1869                 goto out;
1870
1871         for (node = &entry->offset_index; node; node = rb_next(node)) {
1872                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1873                 if (entry->bytes < *bytes) {
1874                         *max_extent_size = max(get_max_extent_size(entry),
1875                                                *max_extent_size);
1876                         continue;
1877                 }
1878
1879                 /* make sure the space returned is big enough
1880                  * to match our requested alignment
1881                  */
1882                 if (*bytes >= align) {
1883                         tmp = entry->offset - ctl->start + align - 1;
1884                         tmp = div64_u64(tmp, align);
1885                         tmp = tmp * align + ctl->start;
1886                         align_off = tmp - entry->offset;
1887                 } else {
1888                         align_off = 0;
1889                         tmp = entry->offset;
1890                 }
1891
1892                 if (entry->bytes < *bytes + align_off) {
1893                         *max_extent_size = max(get_max_extent_size(entry),
1894                                                *max_extent_size);
1895                         continue;
1896                 }
1897
1898                 if (entry->bitmap) {
1899                         u64 size = *bytes;
1900
1901                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1902                         if (!ret) {
1903                                 *offset = tmp;
1904                                 *bytes = size;
1905                                 return entry;
1906                         } else {
1907                                 *max_extent_size =
1908                                         max(get_max_extent_size(entry),
1909                                             *max_extent_size);
1910                         }
1911                         continue;
1912                 }
1913
1914                 *offset = tmp;
1915                 *bytes = entry->bytes - align_off;
1916                 return entry;
1917         }
1918 out:
1919         return NULL;
1920 }
1921
1922 static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
1923                                 struct btrfs_free_space *bitmap_info)
1924 {
1925         struct btrfs_block_group *block_group = ctl->private;
1926         u64 bytes = bitmap_info->bytes;
1927         unsigned int rs, re;
1928         int count = 0;
1929
1930         if (!block_group || !bytes)
1931                 return count;
1932
1933         bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
1934                                    BITS_PER_BITMAP) {
1935                 bytes -= (re - rs) * ctl->unit;
1936                 count++;
1937
1938                 if (!bytes)
1939                         break;
1940         }
1941
1942         return count;
1943 }
1944
1945 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1946                            struct btrfs_free_space *info, u64 offset)
1947 {
1948         info->offset = offset_to_bitmap(ctl, offset);
1949         info->bytes = 0;
1950         info->bitmap_extents = 0;
1951         INIT_LIST_HEAD(&info->list);
1952         link_free_space(ctl, info);
1953         ctl->total_bitmaps++;
1954
1955         ctl->op->recalc_thresholds(ctl);
1956 }
1957
1958 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1959                         struct btrfs_free_space *bitmap_info)
1960 {
1961         /*
1962          * Normally when this is called, the bitmap is completely empty. However,
1963          * if we are blowing up the free space cache for one reason or another
1964          * via __btrfs_remove_free_space_cache(), then the bitmap may not be
1965          * empty and we would leave stale discardable stats behind.
1966          */
1967         if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
1968                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
1969                         bitmap_info->bitmap_extents;
1970                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
1971
1972         }
1973         unlink_free_space(ctl, bitmap_info);
1974         kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1975         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1976         ctl->total_bitmaps--;
1977         ctl->op->recalc_thresholds(ctl);
1978 }
1979
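/*
 * Remove [*offset, *offset + *bytes) starting in this bitmap.  A free range
 * can spill past the end of one bitmap, so clear what we can here, advance
 * *offset/*bytes, and walk to the next entry.  Returns -EAGAIN when the
 * remainder has to come from an extent entry (or a bitmap that needs to be
 * looked up again) so the caller retries, and -EINVAL if the requested space
 * simply isn't there.
 */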
1980 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1981                               struct btrfs_free_space *bitmap_info,
1982                               u64 *offset, u64 *bytes)
1983 {
1984         u64 end;
1985         u64 search_start, search_bytes;
1986         int ret;
1987
1988 again:
1989         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1990
1991         /*
1992          * We need to search for bits in this bitmap.  We could only cover some
1993          * of the extent in this bitmap thanks to how we add space, so we need
1994          * to search for as much of it as we can and clear that amount, and then
1995          * go searching for the next bit.
1996          */
1997         search_start = *offset;
1998         search_bytes = ctl->unit;
1999         search_bytes = min(search_bytes, end - search_start + 1);
2000         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2001                             false);
2002         if (ret < 0 || search_start != *offset)
2003                 return -EINVAL;
2004
2005         /* We may have found more bits than what we need */
2006         search_bytes = min(search_bytes, *bytes);
2007
2008         /* Cannot clear past the end of the bitmap */
2009         search_bytes = min(search_bytes, end - search_start + 1);
2010
2011         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
2012         *offset += search_bytes;
2013         *bytes -= search_bytes;
2014
2015         if (*bytes) {
2016                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
2017                 if (!bitmap_info->bytes)
2018                         free_bitmap(ctl, bitmap_info);
2019
2020                 /*
2021                  * no entry after this bitmap, but we still have bytes to
2022                  * remove, so something has gone wrong.
2023                  */
2024                 if (!next)
2025                         return -EINVAL;
2026
2027                 bitmap_info = rb_entry(next, struct btrfs_free_space,
2028                                        offset_index);
2029
2030                 /*
2031                  * if the next entry isn't a bitmap we need to return to let the
2032                  * extent stuff do its work.
2033                  */
2034                 if (!bitmap_info->bitmap)
2035                         return -EAGAIN;
2036
2037                 /*
2038                  * Ok the next item is a bitmap, but it may not actually hold
2039                  * the information for the rest of this free space stuff, so
2040                  * look for it, and if we don't find it return so we can try
2041                  * everything over again.
2042                  */
2043                 search_start = *offset;
2044                 search_bytes = ctl->unit;
2045                 ret = search_bitmap(ctl, bitmap_info, &search_start,
2046                                     &search_bytes, false);
2047                 if (ret < 0 || search_start != *offset)
2048                         return -EAGAIN;
2049
2050                 goto again;
2051         } else if (!bitmap_info->bytes)
2052                 free_bitmap(ctl, bitmap_info);
2053
2054         return 0;
2055 }
2056
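/*
 * Set as much of [offset, offset + bytes) as fits inside this bitmap entry
 * and return the number of bytes actually added; the caller loops to spill
 * any remainder into the next bitmap.  Adding untrimmed space downgrades the
 * whole bitmap's trim state, the tradeoff described below.
 */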
2057 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2058                                struct btrfs_free_space *info, u64 offset,
2059                                u64 bytes, enum btrfs_trim_state trim_state)
2060 {
2061         u64 bytes_to_set = 0;
2062         u64 end;
2063
2064         /*
2065          * This is a tradeoff to make bitmap trim state minimal.  We mark the
2066          * whole bitmap untrimmed if at any point we add untrimmed regions.
2067          */
2068         if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2069                 if (btrfs_free_space_trimmed(info)) {
2070                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
2071                                 info->bitmap_extents;
2072                         ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2073                 }
2074                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2075         }
2076
2077         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2078
2079         bytes_to_set = min(end - offset, bytes);
2080
2081         bitmap_set_bits(ctl, info, offset, bytes_to_set);
2082
2083         /*
2084          * We set some bytes, we have no idea what the max extent size is
2085          * anymore.
2086          */
2087         info->max_extent_size = 0;
2088
2089         return bytes_to_set;
2091 }
2092
2093 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2094                       struct btrfs_free_space *info)
2095 {
2096         struct btrfs_block_group *block_group = ctl->private;
2097         struct btrfs_fs_info *fs_info = block_group->fs_info;
2098         bool forced = false;
2099
2100 #ifdef CONFIG_BTRFS_DEBUG
2101         if (btrfs_should_fragment_free_space(block_group))
2102                 forced = true;
2103 #endif
2104
2105         /* This is a way to reclaim large regions from the bitmaps. */
2106         if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2107                 return false;
2108
2109         /*
2110          * If we are below the extents threshold then we can add this as an
2111          * extent, and don't have to deal with the bitmap
2112          */
2113         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2114                 /*
2115                  * If this block group has some small extents we don't want to
2116                  * use up all of our free slots in the cache with them, we want
2117                  * to reserve them for larger extents.  However, if we have plenty
2118                  * of cache left then go ahead and add them, no sense in adding
2119                  * the overhead of a bitmap if we don't have to.
2120                  */
2121                 if (info->bytes <= fs_info->sectorsize * 8) {
2122                         if (ctl->free_extents * 3 <= ctl->extents_thresh)
2123                                 return false;
2124                 } else {
2125                         return false;
2126                 }
2127         }
2128
2129         /*
2130          * The original block groups from mkfs can be really small, like 8
2131          * megabytes, so don't bother with a bitmap for those entries.  However
2132          * some block groups can be smaller than what a bitmap would cover but
2133          * are still large enough that they could overflow the 32k memory limit,
2134          * so still allow those block groups to have a bitmap entry.
2136          */
2137         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2138                 return false;
2139
2140         return true;
2141 }
2142
2143 static const struct btrfs_free_space_op free_space_op = {
2144         .recalc_thresholds      = recalculate_thresholds,
2145         .use_bitmap             = use_bitmap,
2146 };
2147
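/*
 * Try to account the free space described by @info in a bitmap, preferring a
 * bitmap owned by the active cluster.  Returns 0 if the space should be kept
 * as a regular extent entry instead (@info is left alone), 1 once every byte
 * has landed in one or more bitmaps, or a negative errno; in the latter two
 * cases @info is consumed.
 */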
2148 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2149                               struct btrfs_free_space *info)
2150 {
2151         struct btrfs_free_space *bitmap_info;
2152         struct btrfs_block_group *block_group = NULL;
2153         int added = 0;
2154         u64 bytes, offset, bytes_added;
2155         enum btrfs_trim_state trim_state;
2156         int ret;
2157
2158         bytes = info->bytes;
2159         offset = info->offset;
2160         trim_state = info->trim_state;
2161
2162         if (!ctl->op->use_bitmap(ctl, info))
2163                 return 0;
2164
2165         if (ctl->op == &free_space_op)
2166                 block_group = ctl->private;
2167 again:
2168         /*
2169          * Since we link bitmaps right into the cluster we need to see if we
2170          * have a cluster here, and if so and it has our bitmap we need to add
2171          * the free space to that bitmap.
2172          */
2173         if (block_group && !list_empty(&block_group->cluster_list)) {
2174                 struct btrfs_free_cluster *cluster;
2175                 struct rb_node *node;
2176                 struct btrfs_free_space *entry;
2177
2178                 cluster = list_entry(block_group->cluster_list.next,
2179                                      struct btrfs_free_cluster,
2180                                      block_group_list);
2181                 spin_lock(&cluster->lock);
2182                 node = rb_first(&cluster->root);
2183                 if (!node) {
2184                         spin_unlock(&cluster->lock);
2185                         goto no_cluster_bitmap;
2186                 }
2187
2188                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2189                 if (!entry->bitmap) {
2190                         spin_unlock(&cluster->lock);
2191                         goto no_cluster_bitmap;
2192                 }
2193
2194                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2195                         bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2196                                                           bytes, trim_state);
2197                         bytes -= bytes_added;
2198                         offset += bytes_added;
2199                 }
2200                 spin_unlock(&cluster->lock);
2201                 if (!bytes) {
2202                         ret = 1;
2203                         goto out;
2204                 }
2205         }
2206
2207 no_cluster_bitmap:
2208         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2209                                          1, 0);
2210         if (!bitmap_info) {
2211                 ASSERT(added == 0);
2212                 goto new_bitmap;
2213         }
2214
2215         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2216                                           trim_state);
2217         bytes -= bytes_added;
2218         offset += bytes_added;
2219         added = 0;
2220
2221         if (!bytes) {
2222                 ret = 1;
2223                 goto out;
2224         } else
2225                 goto again;
2226
2227 new_bitmap:
2228         if (info && info->bitmap) {
2229                 add_new_bitmap(ctl, info, offset);
2230                 added = 1;
2231                 info = NULL;
2232                 goto again;
2233         } else {
2234                 spin_unlock(&ctl->tree_lock);
2235
2236                 /* no pre-allocated info, allocate a new one */
2237                 if (!info) {
2238                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2239                                                  GFP_NOFS);
2240                         if (!info) {
2241                                 spin_lock(&ctl->tree_lock);
2242                                 ret = -ENOMEM;
2243                                 goto out;
2244                         }
2245                 }
2246
2247                 /* allocate the bitmap */
2248                 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2249                                                  GFP_NOFS);
2250                 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2251                 spin_lock(&ctl->tree_lock);
2252                 if (!info->bitmap) {
2253                         ret = -ENOMEM;
2254                         goto out;
2255                 }
2256                 goto again;
2257         }
2258
2259 out:
2260         if (info) {
2261                 if (info->bitmap)
2262                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2263                                         info->bitmap);
2264                 kmem_cache_free(btrfs_free_space_cachep, info);
2265         }
2266
2267         return ret;
2268 }
2269
2270 /*
2271  * Free space merging rules:
2272  *  1) Merge trimmed areas together
2273  *  2) Let untrimmed areas coalesce with trimmed areas
2274  *  3) Always pull neighboring regions from bitmaps
2275  *
2276  * The above rules are for when we merge free space based on btrfs_trim_state.
2277  * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2278  * same reason: to promote larger extent regions which makes life easier for
2279  * find_free_extent().  Rule 2 enables coalescing based on the common path,
2280  * which is returning free space from btrfs_finish_extent_commit().  So when
2281  * newly added free space is already trimmed, we avoid aggregating it with
2282  * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2283  * and provide find_free_extent() with the largest extents possible hoping for
2284  * the reuse path.
2285  */
2286 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2287                           struct btrfs_free_space *info, bool update_stat)
2288 {
2289         struct btrfs_free_space *left_info;
2290         struct btrfs_free_space *right_info;
2291         bool merged = false;
2292         u64 offset = info->offset;
2293         u64 bytes = info->bytes;
2294         const bool is_trimmed = btrfs_free_space_trimmed(info);
2295
2296         /*
2297          * first we want to see if there is free space adjacent to the range we
2298          * are adding; if there is, remove that entry and extend this one to
2299          * cover the entire range.
2300          */
2301         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2302         if (right_info && rb_prev(&right_info->offset_index))
2303                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2304                                      struct btrfs_free_space, offset_index);
2305         else
2306                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2307
2308         /* See try_merge_free_space() comment. */
2309         if (right_info && !right_info->bitmap &&
2310             (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2311                 if (update_stat)
2312                         unlink_free_space(ctl, right_info);
2313                 else
2314                         __unlink_free_space(ctl, right_info);
2315                 info->bytes += right_info->bytes;
2316                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2317                 merged = true;
2318         }
2319
2320         /* See try_merge_free_space() comment. */
2321         if (left_info && !left_info->bitmap &&
2322             left_info->offset + left_info->bytes == offset &&
2323             (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2324                 if (update_stat)
2325                         unlink_free_space(ctl, left_info);
2326                 else
2327                         __unlink_free_space(ctl, left_info);
2328                 info->offset = left_info->offset;
2329                 info->bytes += left_info->bytes;
2330                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2331                 merged = true;
2332         }
2333
2334         return merged;
2335 }
2336
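/*
 * If a bitmap entry covers the byte right after this extent entry, pull the
 * run of set bits starting there out of the bitmap and append it to the
 * extent, freeing the bitmap if it ends up empty.  Returns true if anything
 * was stolen.
 */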
2337 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2338                                      struct btrfs_free_space *info,
2339                                      bool update_stat)
2340 {
2341         struct btrfs_free_space *bitmap;
2342         unsigned long i;
2343         unsigned long j;
2344         const u64 end = info->offset + info->bytes;
2345         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2346         u64 bytes;
2347
2348         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2349         if (!bitmap)
2350                 return false;
2351
2352         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2353         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2354         if (j == i)
2355                 return false;
2356         bytes = (j - i) * ctl->unit;
2357         info->bytes += bytes;
2358
2359         /* See try_merge_free_space() comment. */
2360         if (!btrfs_free_space_trimmed(bitmap))
2361                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2362
2363         if (update_stat)
2364                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2365         else
2366                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2367
2368         if (!bitmap->bytes)
2369                 free_bitmap(ctl, bitmap);
2370
2371         return true;
2372 }
2373
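/*
 * Counterpart of steal_from_bitmap_to_end(): look at the bitmap covering the
 * byte just before this extent entry and, if the bits leading up to it are
 * set, move that run out of the bitmap and grow the extent downwards.
 */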
2374 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2375                                        struct btrfs_free_space *info,
2376                                        bool update_stat)
2377 {
2378         struct btrfs_free_space *bitmap;
2379         u64 bitmap_offset;
2380         unsigned long i;
2381         unsigned long j;
2382         unsigned long prev_j;
2383         u64 bytes;
2384
2385         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2386         /* If we're on a boundary, try the previous logical bitmap. */
2387         if (bitmap_offset == info->offset) {
2388                 if (info->offset == 0)
2389                         return false;
2390                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2391         }
2392
2393         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2394         if (!bitmap)
2395                 return false;
2396
2397         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2398         j = 0;
2399         prev_j = (unsigned long)-1;
2400         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2401                 if (j > i)
2402                         break;
2403                 prev_j = j;
2404         }
2405         if (prev_j == i)
2406                 return false;
2407
2408         if (prev_j == (unsigned long)-1)
2409                 bytes = (i + 1) * ctl->unit;
2410         else
2411                 bytes = (i - prev_j) * ctl->unit;
2412
2413         info->offset -= bytes;
2414         info->bytes += bytes;
2415
2416         /* See try_merge_free_space() comment. */
2417         if (!btrfs_free_space_trimmed(bitmap))
2418                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2419
2420         if (update_stat)
2421                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2422         else
2423                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2424
2425         if (!bitmap->bytes)
2426                 free_bitmap(ctl, bitmap);
2427
2428         return true;
2429 }
2430
2431 /*
2432  * We prefer always to allocate from extent entries, both for clustered and
2433  * non-clustered allocation requests. So when attempting to add a new extent
2434  * entry, try to see if there's adjacent free space in bitmap entries, and if
2435  * there is, migrate that space from the bitmaps to the extent.
2436  * Like this we get better chances of satisfying space allocation requests
2437  * because we attempt to satisfy them based on a single cache entry, and never
2438  * on 2 or more entries - even if the entries represent a contiguous free space
2439  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2440  * ends).
2441  */
2442 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2443                               struct btrfs_free_space *info,
2444                               bool update_stat)
2445 {
2446         /*
2447          * Only work with disconnected entries, as we can change their offset,
2448          * and they must be extent entries.
2449          */
2450         ASSERT(!info->bitmap);
2451         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2452
2453         if (ctl->total_bitmaps > 0) {
2454                 bool stole_end;
2455                 bool stole_front = false;
2456
2457                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2458                 if (ctl->total_bitmaps > 0)
2459                         stole_front = steal_from_bitmap_to_front(ctl, info,
2460                                                                  update_stat);
2461
2462                 if (stole_end || stole_front)
2463                         try_merge_free_space(ctl, info, update_stat);
2464         }
2465 }
2466
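/*
 * Core path for returning free space to the in-memory cache: merge with
 * neighboring extent entries when possible, otherwise consider packing the
 * range into a bitmap, and finally link it as a new extent entry (stealing
 * adjacent bitmap bits first).  Untrimmed space is also queued up for the
 * async discard machinery.
 */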
2467 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2468                            struct btrfs_free_space_ctl *ctl,
2469                            u64 offset, u64 bytes,
2470                            enum btrfs_trim_state trim_state)
2471 {
2472         struct btrfs_block_group *block_group = ctl->private;
2473         struct btrfs_free_space *info;
2474         int ret = 0;
2475         u64 filter_bytes = bytes;
2476
2477         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2478         if (!info)
2479                 return -ENOMEM;
2480
2481         info->offset = offset;
2482         info->bytes = bytes;
2483         info->trim_state = trim_state;
2484         RB_CLEAR_NODE(&info->offset_index);
2485
2486         spin_lock(&ctl->tree_lock);
2487
2488         if (try_merge_free_space(ctl, info, true))
2489                 goto link;
2490
2491         /*
2492          * There was no extent directly to the left or right of this new
2493          * extent, so we know we're going to have to add a new extent entry;
2494          * before we do that, see if we need to drop this into a bitmap instead.
2495          */
2496         ret = insert_into_bitmap(ctl, info);
2497         if (ret < 0) {
2498                 goto out;
2499         } else if (ret) {
2500                 ret = 0;
2501                 goto out;
2502         }
2503 link:
2504         /*
2505          * Only steal free space from adjacent bitmaps if we're sure we're not
2506          * going to add the new free space to existing bitmap entries - because
2507          * that would mean unnecessary work that would be reverted. Therefore
2508          * attempt to steal space from bitmaps if we're adding an extent entry.
2509          */
2510         steal_from_bitmap(ctl, info, true);
2511
2512         filter_bytes = max(filter_bytes, info->bytes);
2513
2514         ret = link_free_space(ctl, info);
2515         if (ret)
2516                 kmem_cache_free(btrfs_free_space_cachep, info);
2517 out:
2518         btrfs_discard_update_discardable(block_group, ctl);
2519         spin_unlock(&ctl->tree_lock);
2520
2521         if (ret) {
2522                 btrfs_crit(fs_info, "unable to add free space: %d", ret);
2523                 ASSERT(ret != -EEXIST);
2524         }
2525
2526         if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2527                 btrfs_discard_check_filter(block_group, filter_bytes);
2528                 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2529         }
2530
2531         return ret;
2532 }
2533
2534 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2535                          u64 bytenr, u64 size)
2536 {
2537         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2538
2539         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2540                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2541
2542         return __btrfs_add_free_space(block_group->fs_info,
2543                                       block_group->free_space_ctl,
2544                                       bytenr, size, trim_state);
2545 }
2546
2547 /*
2548  * This is a subtle distinction because when adding free space back in general,
2549  * we want it to be added as untrimmed for async. But in the case where we add
2550  * it on loading of a block group, we want to consider it trimmed.
2551  */
2552 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2553                                        u64 bytenr, u64 size)
2554 {
2555         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2556
2557         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2558             btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2559                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2560
2561         return __btrfs_add_free_space(block_group->fs_info,
2562                                       block_group->free_space_ctl,
2563                                       bytenr, size, trim_state);
2564 }
2565
2566 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2567                             u64 offset, u64 bytes)
2568 {
2569         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2570         struct btrfs_free_space *info;
2571         int ret;
2572         bool re_search = false;
2573
2574         spin_lock(&ctl->tree_lock);
2575
2576 again:
2577         ret = 0;
2578         if (!bytes)
2579                 goto out_lock;
2580
2581         info = tree_search_offset(ctl, offset, 0, 0);
2582         if (!info) {
2583                 /*
2584                  * oops didn't find an extent that matched the space we wanted
2585                  * to remove, look for a bitmap instead
2586                  */
2587                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2588                                           1, 0);
2589                 if (!info) {
2590                         /*
2591                          * If we found a partial bit of our free space in a
2592                          * bitmap but then couldn't find the other part this may
2593                          * be a problem, so WARN about it.
2594                          */
2595                         WARN_ON(re_search);
2596                         goto out_lock;
2597                 }
2598         }
2599
2600         re_search = false;
2601         if (!info->bitmap) {
2602                 unlink_free_space(ctl, info);
2603                 if (offset == info->offset) {
2604                         u64 to_free = min(bytes, info->bytes);
2605
2606                         info->bytes -= to_free;
2607                         info->offset += to_free;
2608                         if (info->bytes) {
2609                                 ret = link_free_space(ctl, info);
2610                                 WARN_ON(ret);
2611                         } else {
2612                                 kmem_cache_free(btrfs_free_space_cachep, info);
2613                         }
2614
2615                         offset += to_free;
2616                         bytes -= to_free;
2617                         goto again;
2618                 } else {
2619                         u64 old_end = info->bytes + info->offset;
2620
2621                         info->bytes = offset - info->offset;
2622                         ret = link_free_space(ctl, info);
2623                         WARN_ON(ret);
2624                         if (ret)
2625                                 goto out_lock;
2626
2627                         /* Not enough bytes in this entry to satisfy us */
2628                         if (old_end < offset + bytes) {
2629                                 bytes -= old_end - offset;
2630                                 offset = old_end;
2631                                 goto again;
2632                         } else if (old_end == offset + bytes) {
2633                                 /* all done */
2634                                 goto out_lock;
2635                         }
2636                         spin_unlock(&ctl->tree_lock);
2637
2638                         ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2639                                                      offset + bytes,
2640                                                      old_end - (offset + bytes),
2641                                                      info->trim_state);
2642                         WARN_ON(ret);
2643                         goto out;
2644                 }
2645         }
2646
2647         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2648         if (ret == -EAGAIN) {
2649                 re_search = true;
2650                 goto again;
2651         }
2652 out_lock:
2653         btrfs_discard_update_discardable(block_group, ctl);
2654         spin_unlock(&ctl->tree_lock);
2655 out:
2656         return ret;
2657 }
2658
2659 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2660                            u64 bytes)
2661 {
2662         struct btrfs_fs_info *fs_info = block_group->fs_info;
2663         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2664         struct btrfs_free_space *info;
2665         struct rb_node *n;
2666         int count = 0;
2667
2668         spin_lock(&ctl->tree_lock);
2669         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2670                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2671                 if (info->bytes >= bytes && !block_group->ro)
2672                         count++;
2673                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2674                            info->offset, info->bytes,
2675                        (info->bitmap) ? "yes" : "no");
2676         }
2677         spin_unlock(&ctl->tree_lock);
2678         btrfs_info(fs_info, "block group has cluster?: %s",
2679                list_empty(&block_group->cluster_list) ? "no" : "yes");
2680         btrfs_info(fs_info,
2681                    "%d blocks of free space at or bigger than %llu bytes found", count, bytes);
2682 }
2683
2684 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
2685 {
2686         struct btrfs_fs_info *fs_info = block_group->fs_info;
2687         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2688
2689         spin_lock_init(&ctl->tree_lock);
2690         ctl->unit = fs_info->sectorsize;
2691         ctl->start = block_group->start;
2692         ctl->private = block_group;
2693         ctl->op = &free_space_op;
2694         INIT_LIST_HEAD(&ctl->trimming_ranges);
2695         mutex_init(&ctl->cache_writeout_mutex);
2696
2697         /*
2698          * we only want to have 32k of ram per block group for keeping
2699          * track of free space, and if we pass 1/2 of that we want to
2700          * start converting things over to using bitmaps
2701          */
2702         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2703 }
2704
2705 /*
2706  * for a given cluster, put all of its extents back into the free
2707  * space cache.  If the block group passed doesn't match the block group
2708  * pointed to by the cluster, someone else raced in and freed the
2709  * cluster already.  In that case, we just return without changing anything
2710  */
2711 static int
2712 __btrfs_return_cluster_to_free_space(
2713                              struct btrfs_block_group *block_group,
2714                              struct btrfs_free_cluster *cluster)
2715 {
2716         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2717         struct btrfs_free_space *entry;
2718         struct rb_node *node;
2719
2720         spin_lock(&cluster->lock);
2721         if (cluster->block_group != block_group)
2722                 goto out;
2723
2724         cluster->block_group = NULL;
2725         cluster->window_start = 0;
2726         list_del_init(&cluster->block_group_list);
2727
2728         node = rb_first(&cluster->root);
2729         while (node) {
2730                 bool bitmap;
2731
2732                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2733                 node = rb_next(&entry->offset_index);
2734                 rb_erase(&entry->offset_index, &cluster->root);
2735                 RB_CLEAR_NODE(&entry->offset_index);
2736
2737                 bitmap = (entry->bitmap != NULL);
2738                 if (!bitmap) {
2739                         /* Merging treats extents as if they were new */
2740                         if (!btrfs_free_space_trimmed(entry)) {
2741                                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
2742                                 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2743                                         entry->bytes;
2744                         }
2745
2746                         try_merge_free_space(ctl, entry, false);
2747                         steal_from_bitmap(ctl, entry, false);
2748
2749                         /* As we insert directly, update these statistics */
2750                         if (!btrfs_free_space_trimmed(entry)) {
2751                                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
2752                                 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2753                                         entry->bytes;
2754                         }
2755                 }
2756                 tree_insert_offset(&ctl->free_space_offset,
2757                                    entry->offset, &entry->offset_index, bitmap);
2758         }
2759         cluster->root = RB_ROOT;
2760
2761 out:
2762         spin_unlock(&cluster->lock);
2763         btrfs_put_block_group(block_group);
2764         return 0;
2765 }
2766
2767 static void __btrfs_remove_free_space_cache_locked(
2768                                 struct btrfs_free_space_ctl *ctl)
2769 {
2770         struct btrfs_free_space *info;
2771         struct rb_node *node;
2772
2773         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2774                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2775                 if (!info->bitmap) {
2776                         unlink_free_space(ctl, info);
2777                         kmem_cache_free(btrfs_free_space_cachep, info);
2778                 } else {
2779                         free_bitmap(ctl, info);
2780                 }
2781
2782                 cond_resched_lock(&ctl->tree_lock);
2783         }
2784 }
2785
2786 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2787 {
2788         spin_lock(&ctl->tree_lock);
2789         __btrfs_remove_free_space_cache_locked(ctl);
2790         if (ctl->private)
2791                 btrfs_discard_update_discardable(ctl->private, ctl);
2792         spin_unlock(&ctl->tree_lock);
2793 }
2794
2795 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2796 {
2797         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2798         struct btrfs_free_cluster *cluster;
2799         struct list_head *head;
2800
2801         spin_lock(&ctl->tree_lock);
2802         while ((head = block_group->cluster_list.next) !=
2803                &block_group->cluster_list) {
2804                 cluster = list_entry(head, struct btrfs_free_cluster,
2805                                      block_group_list);
2806
2807                 WARN_ON(cluster->block_group != block_group);
2808                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2809
2810                 cond_resched_lock(&ctl->tree_lock);
2811         }
2812         __btrfs_remove_free_space_cache_locked(ctl);
2813         btrfs_discard_update_discardable(block_group, ctl);
2814         spin_unlock(&ctl->tree_lock);
2816 }
2817
2818 /**
2819  * btrfs_is_free_space_trimmed - see if everything is trimmed
2820  * @block_group: block_group of interest
2821  *
2822  * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2823  */
2824 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2825 {
2826         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2827         struct btrfs_free_space *info;
2828         struct rb_node *node;
2829         bool ret = true;
2830
2831         spin_lock(&ctl->tree_lock);
2832         node = rb_first(&ctl->free_space_offset);
2833
2834         while (node) {
2835                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2836
2837                 if (!btrfs_free_space_trimmed(info)) {
2838                         ret = false;
2839                         break;
2840                 }
2841
2842                 node = rb_next(node);
2843         }
2844
2845         spin_unlock(&ctl->tree_lock);
2846         return ret;
2847 }
2848
2849 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2850                                u64 offset, u64 bytes, u64 empty_size,
2851                                u64 *max_extent_size)
2852 {
2853         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2854         struct btrfs_discard_ctl *discard_ctl =
2855                                         &block_group->fs_info->discard_ctl;
2856         struct btrfs_free_space *entry = NULL;
2857         u64 bytes_search = bytes + empty_size;
2858         u64 ret = 0;
2859         u64 align_gap = 0;
2860         u64 align_gap_len = 0;
2861         enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2862
2863         spin_lock(&ctl->tree_lock);
2864         entry = find_free_space(ctl, &offset, &bytes_search,
2865                                 block_group->full_stripe_len, max_extent_size);
2866         if (!entry)
2867                 goto out;
2868
2869         ret = offset;
2870         if (entry->bitmap) {
2871                 bitmap_clear_bits(ctl, entry, offset, bytes);
2872
2873                 if (!btrfs_free_space_trimmed(entry))
2874                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2875
2876                 if (!entry->bytes)
2877                         free_bitmap(ctl, entry);
2878         } else {
2879                 unlink_free_space(ctl, entry);
2880                 align_gap_len = offset - entry->offset;
2881                 align_gap = entry->offset;
2882                 align_gap_trim_state = entry->trim_state;
2883
2884                 if (!btrfs_free_space_trimmed(entry))
2885                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2886
2887                 entry->offset = offset + bytes;
2888                 WARN_ON(entry->bytes < bytes + align_gap_len);
2889
2890                 entry->bytes -= bytes + align_gap_len;
2891                 if (!entry->bytes)
2892                         kmem_cache_free(btrfs_free_space_cachep, entry);
2893                 else
2894                         link_free_space(ctl, entry);
2895         }
2896 out:
2897         btrfs_discard_update_discardable(block_group, ctl);
2898         spin_unlock(&ctl->tree_lock);
2899
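        /*
         * If the allocation was bumped forward to satisfy alignment, give the
         * gap carved off the front of the extent back to the free space tree,
         * preserving its original trim state.
         */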
2900         if (align_gap_len)
2901                 __btrfs_add_free_space(block_group->fs_info, ctl,
2902                                        align_gap, align_gap_len,
2903                                        align_gap_trim_state);
2904         return ret;
2905 }
2906
2907 /*
2908  * given a cluster, put all of its extents back into the free space
2909  * cache.  If a block group is passed, this function will only free
2910  * a cluster that belongs to the passed block group.
2911  *
2912  * Otherwise, it'll get a reference on the block group pointed to by the
2913  * cluster and remove the cluster from it.
2914  */
2915 int btrfs_return_cluster_to_free_space(
2916                                struct btrfs_block_group *block_group,
2917                                struct btrfs_free_cluster *cluster)
2918 {
2919         struct btrfs_free_space_ctl *ctl;
2920         int ret;
2921
2922         /* first, get a safe pointer to the block group */
2923         spin_lock(&cluster->lock);
2924         if (!block_group) {
2925                 block_group = cluster->block_group;
2926                 if (!block_group) {
2927                         spin_unlock(&cluster->lock);
2928                         return 0;
2929                 }
2930         } else if (cluster->block_group != block_group) {
2931                 /* someone else has already freed it, don't redo their work */
2932                 spin_unlock(&cluster->lock);
2933                 return 0;
2934         }
2935         atomic_inc(&block_group->count);
2936         spin_unlock(&cluster->lock);
2937
2938         ctl = block_group->free_space_ctl;
2939
2940         /* now return any extents the cluster had on it */
2941         spin_lock(&ctl->tree_lock);
2942         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2943         spin_unlock(&ctl->tree_lock);
2944
2945         btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
2946
2947         /* finally drop our ref */
2948         btrfs_put_block_group(block_group);
2949         return ret;
2950 }
2951
2952 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
2953                                    struct btrfs_free_cluster *cluster,
2954                                    struct btrfs_free_space *entry,
2955                                    u64 bytes, u64 min_start,
2956                                    u64 *max_extent_size)
2957 {
2958         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2959         int err;
2960         u64 search_start = min_start;
2961         u64 search_bytes = bytes;
2962         u64 ret = 0;
2966
2967         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2968         if (err) {
2969                 *max_extent_size = max(get_max_extent_size(entry),
2970                                        *max_extent_size);
2971                 return 0;
2972         }
2973
2974         ret = search_start;
2975         __bitmap_clear_bits(ctl, entry, ret, bytes);
2976
2977         return ret;
2978 }
2979
2980 /*
2981  * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
2982  * couldn't find anything suitably large, or a logical disk offset if
2983  * things worked out.
2984  */
2985 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
2986                              struct btrfs_free_cluster *cluster, u64 bytes,
2987                              u64 min_start, u64 *max_extent_size)
2988 {
2989         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2990         struct btrfs_discard_ctl *discard_ctl =
2991                                         &block_group->fs_info->discard_ctl;
2992         struct btrfs_free_space *entry = NULL;
2993         struct rb_node *node;
2994         u64 ret = 0;
2995
2996         spin_lock(&cluster->lock);
2997         if (bytes > cluster->max_size)
2998                 goto out;
2999
3000         if (cluster->block_group != block_group)
3001                 goto out;
3002
3003         node = rb_first(&cluster->root);
3004         if (!node)
3005                 goto out;
3006
3007         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3008         while (1) {
3009                 if (entry->bytes < bytes)
3010                         *max_extent_size = max(get_max_extent_size(entry),
3011                                                *max_extent_size);
3012
3013                 if (entry->bytes < bytes ||
3014                     (!entry->bitmap && entry->offset < min_start)) {
3015                         node = rb_next(&entry->offset_index);
3016                         if (!node)
3017                                 break;
3018                         entry = rb_entry(node, struct btrfs_free_space,
3019                                          offset_index);
3020                         continue;
3021                 }
3022
3023                 if (entry->bitmap) {
3024                         ret = btrfs_alloc_from_bitmap(block_group,
3025                                                       cluster, entry, bytes,
3026                                                       cluster->window_start,
3027                                                       max_extent_size);
3028                         if (ret == 0) {
3029                                 node = rb_next(&entry->offset_index);
3030                                 if (!node)
3031                                         break;
3032                                 entry = rb_entry(node, struct btrfs_free_space,
3033                                                  offset_index);
3034                                 continue;
3035                         }
3036                         cluster->window_start += bytes;
3037                 } else {
3038                         ret = entry->offset;
3039
3040                         entry->offset += bytes;
3041                         entry->bytes -= bytes;
3042                 }
3043
3044                 if (entry->bytes == 0)
3045                         rb_erase(&entry->offset_index, &cluster->root);
3046                 break;
3047         }
3048 out:
3049         spin_unlock(&cluster->lock);
3050
3051         if (!ret)
3052                 return 0;
3053
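        /*
         * The allocation itself was done under cluster->lock; now update the
         * block group's free space accounting under tree_lock and free the
         * entry if we emptied it.
         */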
3054         spin_lock(&ctl->tree_lock);
3055
3056         if (!btrfs_free_space_trimmed(entry))
3057                 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3058
3059         ctl->free_space -= bytes;
3060         if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3061                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3062         if (entry->bytes == 0) {
3063                 ctl->free_extents--;
3064                 if (entry->bitmap) {
3065                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
3066                                         entry->bitmap);
3067                         ctl->total_bitmaps--;
3068                         ctl->op->recalc_thresholds(ctl);
3069                 } else if (!btrfs_free_space_trimmed(entry)) {
3070                         ctl->discardable_extents[BTRFS_STAT_CURR]--;
3071                 }
3072                 kmem_cache_free(btrfs_free_space_cachep, entry);
3073         }
3074
3075         spin_unlock(&ctl->tree_lock);
3076
3077         return ret;
3078 }
3079
3080 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3081                                 struct btrfs_free_space *entry,
3082                                 struct btrfs_free_cluster *cluster,
3083                                 u64 offset, u64 bytes,
3084                                 u64 cont1_bytes, u64 min_bytes)
3085 {
3086         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3087         unsigned long next_zero;
3088         unsigned long i;
3089         unsigned long want_bits;
3090         unsigned long min_bits;
3091         unsigned long found_bits;
3092         unsigned long max_bits = 0;
3093         unsigned long start = 0;
3094         unsigned long total_found = 0;
3095         int ret;
3096
3097         i = offset_to_bit(entry->offset, ctl->unit,
3098                           max_t(u64, offset, entry->offset));
3099         want_bits = bytes_to_bits(bytes, ctl->unit);
3100         min_bits = bytes_to_bits(min_bytes, ctl->unit);
3101
3102         /*
3103          * Don't bother looking for a cluster in this bitmap if it's heavily
3104          * fragmented.
3105          */
3106         if (entry->max_extent_size &&
3107             entry->max_extent_size < cont1_bytes)
3108                 return -ENOSPC;
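        /*
         * Scan runs of at least min_bits set bits, accumulating them until we
         * have want_bits in total and a single run of at least cont1_bytes.
         */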
3109 again:
3110         found_bits = 0;
3111         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3112                 next_zero = find_next_zero_bit(entry->bitmap,
3113                                                BITS_PER_BITMAP, i);
3114                 if (next_zero - i >= min_bits) {
3115                         found_bits = next_zero - i;
3116                         if (found_bits > max_bits)
3117                                 max_bits = found_bits;
3118                         break;
3119                 }
3120                 if (next_zero - i > max_bits)
3121                         max_bits = next_zero - i;
3122                 i = next_zero;
3123         }
3124
3125         if (!found_bits) {
3126                 entry->max_extent_size = (u64)max_bits * ctl->unit;
3127                 return -ENOSPC;
3128         }
3129
3130         if (!total_found) {
3131                 start = i;
3132                 cluster->max_size = 0;
3133         }
3134
3135         total_found += found_bits;
3136
3137         if (cluster->max_size < found_bits * ctl->unit)
3138                 cluster->max_size = found_bits * ctl->unit;
3139
3140         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3141                 i = next_zero + 1;
3142                 goto again;
3143         }
3144
3145         cluster->window_start = start * ctl->unit + entry->offset;
3146         rb_erase(&entry->offset_index, &ctl->free_space_offset);
3147         ret = tree_insert_offset(&cluster->root, entry->offset,
3148                                  &entry->offset_index, 1);
3149         ASSERT(!ret); /* -EEXIST; Logic error */
3150
3151         trace_btrfs_setup_cluster(block_group, cluster,
3152                                   total_found * ctl->unit, 1);
3153         return 0;
3154 }
3155
3156 /*
3157  * This searches the block group for just extents to fill the cluster with.
3158  * Try to find a cluster with at least bytes total bytes, at least one
3159  * extent of cont1_bytes, and other extents of at least min_bytes.
3160  */
3161 static noinline int
3162 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3163                         struct btrfs_free_cluster *cluster,
3164                         struct list_head *bitmaps, u64 offset, u64 bytes,
3165                         u64 cont1_bytes, u64 min_bytes)
3166 {
3167         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3168         struct btrfs_free_space *first = NULL;
3169         struct btrfs_free_space *entry = NULL;
3170         struct btrfs_free_space *last;
3171         struct rb_node *node;
3172         u64 window_free;
3173         u64 max_extent;
3174         u64 total_size = 0;
3175
3176         entry = tree_search_offset(ctl, offset, 0, 1);
3177         if (!entry)
3178                 return -ENOSPC;
3179
3180         /*
3181          * We don't want bitmaps, so just move along until we find a normal
3182          * extent entry.
3183          */
3184         while (entry->bitmap || entry->bytes < min_bytes) {
3185                 if (entry->bitmap && list_empty(&entry->list))
3186                         list_add_tail(&entry->list, bitmaps);
3187                 node = rb_next(&entry->offset_index);
3188                 if (!node)
3189                         return -ENOSPC;
3190                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3191         }
3192
3193         window_free = entry->bytes;
3194         max_extent = entry->bytes;
3195         first = entry;
3196         last = entry;
3197
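        /*
         * Walk the remaining entries, growing the window with every extent of
         * at least min_bytes and remembering the largest extent seen.
         */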
3198         for (node = rb_next(&entry->offset_index); node;
3199              node = rb_next(&entry->offset_index)) {
3200                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3201
3202                 if (entry->bitmap) {
3203                         if (list_empty(&entry->list))
3204                                 list_add_tail(&entry->list, bitmaps);
3205                         continue;
3206                 }
3207
3208                 if (entry->bytes < min_bytes)
3209                         continue;
3210
3211                 last = entry;
3212                 window_free += entry->bytes;
3213                 if (entry->bytes > max_extent)
3214                         max_extent = entry->bytes;
3215         }
3216
3217         if (window_free < bytes || max_extent < cont1_bytes)
3218                 return -ENOSPC;
3219
3220         cluster->window_start = first->offset;
3221
3222         node = &first->offset_index;
3223
3224         /*
3225          * now we've found our entries, pull them out of the free space
3226          * cache and put them into the cluster rbtree
3227          */
3228         do {
3229                 int ret;
3230
3231                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3232                 node = rb_next(&entry->offset_index);
3233                 if (entry->bitmap || entry->bytes < min_bytes)
3234                         continue;
3235
3236                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3237                 ret = tree_insert_offset(&cluster->root, entry->offset,
3238                                          &entry->offset_index, 0);
3239                 total_size += entry->bytes;
3240                 ASSERT(!ret); /* -EEXIST; Logic error */
3241         } while (node && entry != last);
3242
3243         cluster->max_size = max_extent;
3244         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3245         return 0;
3246 }
3247
3248 /*
3249  * This specifically looks for bitmaps that may work in the cluster, assuming
3250  * that we have already failed to find extents that will work.
3251  */
3252 static noinline int
3253 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3254                      struct btrfs_free_cluster *cluster,
3255                      struct list_head *bitmaps, u64 offset, u64 bytes,
3256                      u64 cont1_bytes, u64 min_bytes)
3257 {
3258         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3259         struct btrfs_free_space *entry = NULL;
3260         int ret = -ENOSPC;
3261         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3262
3263         if (ctl->total_bitmaps == 0)
3264                 return -ENOSPC;
3265
3266         /*
3267          * The bitmap that covers offset won't be in the list unless offset
3268          * is just its start offset.
3269          */
3270         if (!list_empty(bitmaps))
3271                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3272
3273         if (!entry || entry->offset != bitmap_offset) {
3274                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3275                 if (entry && list_empty(&entry->list))
3276                         list_add(&entry->list, bitmaps);
3277         }
3278
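        /* Try each candidate bitmap until one yields a usable cluster. */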
3279         list_for_each_entry(entry, bitmaps, list) {
3280                 if (entry->bytes < bytes)
3281                         continue;
3282                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3283                                            bytes, cont1_bytes, min_bytes);
3284                 if (!ret)
3285                         return 0;
3286         }
3287
3288         /*
3289          * The bitmaps list has all the bitmaps that record free space
3290          * starting after offset, so no more search is required.
3291          */
3292         return -ENOSPC;
3293 }
3294
3295 /*
3296  * Here we try to find a cluster of blocks in a block group.  The goal
3297  * is to find at least bytes + empty_size of free space.
3298  * We might not find it all in one contiguous area.
3299  *
3300  * Returns zero and sets up the cluster if things worked out, otherwise
3301  * it returns -ENOSPC.
3302  */
3303 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3304                              struct btrfs_free_cluster *cluster,
3305                              u64 offset, u64 bytes, u64 empty_size)
3306 {
3307         struct btrfs_fs_info *fs_info = block_group->fs_info;
3308         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3309         struct btrfs_free_space *entry, *tmp;
3310         LIST_HEAD(bitmaps);
3311         u64 min_bytes;
3312         u64 cont1_bytes;
3313         int ret;
3314
3315         /*
3316          * Choose the minimum extent size we'll require for this
3317          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3318          * For metadata, allow allocations with smaller extents.  For
3319          * data, keep it dense.
3320          */
3321         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3322                 cont1_bytes = min_bytes = bytes + empty_size;
3323         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3324                 cont1_bytes = bytes;
3325                 min_bytes = fs_info->sectorsize;
3326         } else {
3327                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3328                 min_bytes = fs_info->sectorsize;
3329         }
3330
3331         spin_lock(&ctl->tree_lock);
3332
3333         /*
3334          * If we know we don't have enough space to make a cluster don't even
3335          * bother doing all the work to try and find one.
3336          */
3337         if (ctl->free_space < bytes) {
3338                 spin_unlock(&ctl->tree_lock);
3339                 return -ENOSPC;
3340         }
3341
3342         spin_lock(&cluster->lock);
3343
3344         /* someone already found a cluster, hooray */
3345         if (cluster->block_group) {
3346                 ret = 0;
3347                 goto out;
3348         }
3349
3350         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3351                                  min_bytes);
3352
3353         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3354                                       bytes + empty_size,
3355                                       cont1_bytes, min_bytes);
3356         if (ret)
3357                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3358                                            offset, bytes + empty_size,
3359                                            cont1_bytes, min_bytes);
3360
3361         /* Clear our temporary list */
3362         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3363                 list_del_init(&entry->list);
3364
3365         if (!ret) {
3366                 atomic_inc(&block_group->count);
3367                 list_add_tail(&cluster->block_group_list,
3368                               &block_group->cluster_list);
3369                 cluster->block_group = block_group;
3370         } else {
3371                 trace_btrfs_failed_cluster_setup(block_group);
3372         }
3373 out:
3374         spin_unlock(&cluster->lock);
3375         spin_unlock(&ctl->tree_lock);
3376
3377         return ret;
3378 }
3379
3380 /*
3381  * simple code to zero out a cluster
3382  */
3383 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3384 {
3385         spin_lock_init(&cluster->lock);
3386         spin_lock_init(&cluster->refill_lock);
3387         cluster->root = RB_ROOT;
3388         cluster->max_size = 0;
3389         cluster->fragmented = false;
3390         INIT_LIST_HEAD(&cluster->block_group_list);
3391         cluster->block_group = NULL;
3392 }
3393
3394 static int do_trimming(struct btrfs_block_group *block_group,
3395                        u64 *total_trimmed, u64 start, u64 bytes,
3396                        u64 reserved_start, u64 reserved_bytes,
3397                        enum btrfs_trim_state reserved_trim_state,
3398                        struct btrfs_trim_range *trim_entry)
3399 {
3400         struct btrfs_space_info *space_info = block_group->space_info;
3401         struct btrfs_fs_info *fs_info = block_group->fs_info;
3402         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3403         int ret;
3404         int update = 0;
3405         const u64 end = start + bytes;
3406         const u64 reserved_end = reserved_start + reserved_bytes;
3407         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3408         u64 trimmed = 0;
3409
3410         spin_lock(&space_info->lock);
3411         spin_lock(&block_group->lock);
3412         if (!block_group->ro) {
3413                 block_group->reserved += reserved_bytes;
3414                 space_info->bytes_reserved += reserved_bytes;
3415                 update = 1;
3416         }
3417         spin_unlock(&block_group->lock);
3418         spin_unlock(&space_info->lock);
3419
3420         ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3421         if (!ret) {
3422                 *total_trimmed += trimmed;
3423                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3424         }
3425
3426         mutex_lock(&ctl->cache_writeout_mutex);
3427         if (reserved_start < start)
3428                 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3429                                        start - reserved_start,
3430                                        reserved_trim_state);
3431         if (end < reserved_end)
3432                 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3433                                        reserved_trim_state);
3434         __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3435         list_del(&trim_entry->list);
3436         mutex_unlock(&ctl->cache_writeout_mutex);
3437
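        /*
         * Drop the space we temporarily reserved above.  If the block group
         * went read-only in the meantime, those bytes now count as readonly.
         */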
3438         if (update) {
3439                 spin_lock(&space_info->lock);
3440                 spin_lock(&block_group->lock);
3441                 if (block_group->ro)
3442                         space_info->bytes_readonly += reserved_bytes;
3443                 block_group->reserved -= reserved_bytes;
3444                 space_info->bytes_reserved -= reserved_bytes;
3445                 spin_unlock(&block_group->lock);
3446                 spin_unlock(&space_info->lock);
3447         }
3448
3449         return ret;
3450 }
3451
3452 /*
3453  * If @async is set, then we will trim 1 region and return.
3454  */
3455 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3456                           u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3457                           bool async)
3458 {
3459         struct btrfs_discard_ctl *discard_ctl =
3460                                         &block_group->fs_info->discard_ctl;
3461         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3462         struct btrfs_free_space *entry;
3463         struct rb_node *node;
3464         int ret = 0;
3465         u64 extent_start;
3466         u64 extent_bytes;
3467         enum btrfs_trim_state extent_trim_state;
3468         u64 bytes;
3469         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3470
3471         while (start < end) {
3472                 struct btrfs_trim_range trim_entry;
3473
3474                 mutex_lock(&ctl->cache_writeout_mutex);
3475                 spin_lock(&ctl->tree_lock);
3476
3477                 if (ctl->free_space < minlen)
3478                         goto out_unlock;
3479
3480                 entry = tree_search_offset(ctl, start, 0, 1);
3481                 if (!entry)
3482                         goto out_unlock;
3483
3484                 /* Skip bitmaps and, if async, already trimmed entries */
3485                 while (entry->bitmap ||
3486                        (async && btrfs_free_space_trimmed(entry))) {
3487                         node = rb_next(&entry->offset_index);
3488                         if (!node)
3489                                 goto out_unlock;
3490                         entry = rb_entry(node, struct btrfs_free_space,
3491                                          offset_index);
3492                 }
3493
3494                 if (entry->offset >= end)
3495                         goto out_unlock;
3496
3497                 extent_start = entry->offset;
3498                 extent_bytes = entry->bytes;
3499                 extent_trim_state = entry->trim_state;
3500                 if (async) {
3501                         start = entry->offset;
3502                         bytes = entry->bytes;
3503                         if (bytes < minlen) {
3504                                 spin_unlock(&ctl->tree_lock);
3505                                 mutex_unlock(&ctl->cache_writeout_mutex);
3506                                 goto next;
3507                         }
3508                         unlink_free_space(ctl, entry);
3509                         /*
3510                          * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3511                          * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3512                          * X when we come back around.  So trim it now.
3513                          */
3514                         if (max_discard_size &&
3515                             bytes >= (max_discard_size +
3516                                       BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3517                                 bytes = max_discard_size;
3518                                 extent_bytes = max_discard_size;
3519                                 entry->offset += max_discard_size;
3520                                 entry->bytes -= max_discard_size;
3521                                 link_free_space(ctl, entry);
3522                         } else {
3523                                 kmem_cache_free(btrfs_free_space_cachep, entry);
3524                         }
3525                 } else {
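                        /*
                         * Sync trim: clamp to the requested [start, end) range
                         * and drop the whole extent from the free space tree.
                         */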
3526                         start = max(start, extent_start);
3527                         bytes = min(extent_start + extent_bytes, end) - start;
3528                         if (bytes < minlen) {
3529                                 spin_unlock(&ctl->tree_lock);
3530                                 mutex_unlock(&ctl->cache_writeout_mutex);
3531                                 goto next;
3532                         }
3533
3534                         unlink_free_space(ctl, entry);
3535                         kmem_cache_free(btrfs_free_space_cachep, entry);
3536                 }
3537
3538                 spin_unlock(&ctl->tree_lock);
3539                 trim_entry.start = extent_start;
3540                 trim_entry.bytes = extent_bytes;
3541                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3542                 mutex_unlock(&ctl->cache_writeout_mutex);
3543
3544                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3545                                   extent_start, extent_bytes, extent_trim_state,
3546                                   &trim_entry);
3547                 if (ret) {
3548                         block_group->discard_cursor = start + bytes;
3549                         break;
3550                 }
3551 next:
3552                 start += bytes;
3553                 block_group->discard_cursor = start;
3554                 if (async && *total_trimmed)
3555                         break;
3556
3557                 if (fatal_signal_pending(current)) {
3558                         ret = -ERESTARTSYS;
3559                         break;
3560                 }
3561
3562                 cond_resched();
3563         }
3564
3565         return ret;
3566
3567 out_unlock:
3568         block_group->discard_cursor = btrfs_block_group_end(block_group);
3569         spin_unlock(&ctl->tree_lock);
3570         mutex_unlock(&ctl->cache_writeout_mutex);
3571
3572         return ret;
3573 }
3574
3575 /*
3576  * If we break out of trimming a bitmap prematurely, we should reset the
3577  * trimming bit.  In a rather contrived case, it's possible to race here so
3578  * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3579  *
3580  * start = start of bitmap
3581  * end = near end of bitmap
3582  *
3583  * Thread 1:                    Thread 2:
3584  * trim_bitmaps(start)
3585  *                              trim_bitmaps(end)
3586  *                              end_trimming_bitmap()
3587  * reset_trimming_bitmap()
3588  */
3589 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3590 {
3591         struct btrfs_free_space *entry;
3592
3593         spin_lock(&ctl->tree_lock);
3594         entry = tree_search_offset(ctl, offset, 1, 0);
3595         if (entry) {
3596                 if (btrfs_free_space_trimmed(entry)) {
3597                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
3598                                 entry->bitmap_extents;
3599                         ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3600                 }
3601                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3602         }
3603
3604         spin_unlock(&ctl->tree_lock);
3605 }
3606
3607 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3608                                 struct btrfs_free_space *entry)
3609 {
3610         if (btrfs_free_space_trimming_bitmap(entry)) {
3611                 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3612                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3613                         entry->bitmap_extents;
3614                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3615         }
3616 }
3617
3618 /*
3619  * If @async is set, then we will trim 1 region and return.
3620  */
3621 static int trim_bitmaps(struct btrfs_block_group *block_group,
3622                         u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3623                         u64 maxlen, bool async)
3624 {
3625         struct btrfs_discard_ctl *discard_ctl =
3626                                         &block_group->fs_info->discard_ctl;
3627         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3628         struct btrfs_free_space *entry;
3629         int ret = 0;
3630         int ret2;
3631         u64 bytes;
3632         u64 offset = offset_to_bitmap(ctl, start);
3633         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3634
3635         while (offset < end) {
3636                 bool next_bitmap = false;
3637                 struct btrfs_trim_range trim_entry;
3638
3639                 mutex_lock(&ctl->cache_writeout_mutex);
3640                 spin_lock(&ctl->tree_lock);
3641
3642                 if (ctl->free_space < minlen) {
3643                         block_group->discard_cursor =
3644                                 btrfs_block_group_end(block_group);
3645                         spin_unlock(&ctl->tree_lock);
3646                         mutex_unlock(&ctl->cache_writeout_mutex);
3647                         break;
3648                 }
3649
3650                 entry = tree_search_offset(ctl, offset, 1, 0);
3651                 /*
3652                  * Bitmaps are marked trimmed lossily now to prevent constant
3653                  * discarding of the same bitmap (the reason why we are bound
3654                  * by the filters).  So, retrim the block group bitmaps when we
3655                  * are preparing to punt to the unused_bgs list.  This uses
3656                  * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3657                  * which is the only discard index which sets minlen to 0.
3658                  */
3659                 if (!entry || (async && minlen && start == offset &&
3660                                btrfs_free_space_trimmed(entry))) {
3661                         spin_unlock(&ctl->tree_lock);
3662                         mutex_unlock(&ctl->cache_writeout_mutex);
3663                         next_bitmap = true;
3664                         goto next;
3665                 }
3666
3667                 /*
3668                  * Async discard bitmap trimming begins by setting the start
3669                  * to be key.objectid, and offset_to_bitmap() aligns to the
3670                  * start of the bitmap.  This lets us know we are fully
3671                  * scanning the bitmap rather than only some portion of it.
3672                  */
3673                 if (start == offset)
3674                         entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3675
3676                 bytes = minlen;
3677                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3678                 if (ret2 || start >= end) {
3679                         /*
3680                          * We lossily consider a bitmap trimmed if we only skip
3681                          * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3682                          */
3683                         if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3684                                 end_trimming_bitmap(ctl, entry);
3685                         else
3686                                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3687                         spin_unlock(&ctl->tree_lock);
3688                         mutex_unlock(&ctl->cache_writeout_mutex);
3689                         next_bitmap = true;
3690                         goto next;
3691                 }
3692
3693                 /*
3694                  * We already trimmed a region, but are using the locking above
3695                  * to reset the trim_state.
3696                  */
3697                 if (async && *total_trimmed) {
3698                         spin_unlock(&ctl->tree_lock);
3699                         mutex_unlock(&ctl->cache_writeout_mutex);
3700                         goto out;
3701                 }
3702
3703                 bytes = min(bytes, end - start);
3704                 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3705                         spin_unlock(&ctl->tree_lock);
3706                         mutex_unlock(&ctl->cache_writeout_mutex);
3707                         goto next;
3708                 }
3709
3710                 /*
3711                  * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3712                  * If X < @minlen, we won't trim X when we come back around.
3713                  * So trim it now.  We differ here from trimming extents as we
3714                  * don't keep individual state per bit.
3715                  */
3716                 if (async &&
3717                     max_discard_size &&
3718                     bytes > (max_discard_size + minlen))
3719                         bytes = max_discard_size;
3720
3721                 bitmap_clear_bits(ctl, entry, start, bytes);
3722                 if (entry->bytes == 0)
3723                         free_bitmap(ctl, entry);
3724
3725                 spin_unlock(&ctl->tree_lock);
3726                 trim_entry.start = start;
3727                 trim_entry.bytes = bytes;
3728                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3729                 mutex_unlock(&ctl->cache_writeout_mutex);
3730
3731                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3732                                   start, bytes, 0, &trim_entry);
3733                 if (ret) {
3734                         reset_trimming_bitmap(ctl, offset);
3735                         block_group->discard_cursor =
3736                                 btrfs_block_group_end(block_group);
3737                         break;
3738                 }
3739 next:
3740                 if (next_bitmap) {
3741                         offset += BITS_PER_BITMAP * ctl->unit;
3742                         start = offset;
3743                 } else {
3744                         start += bytes;
3745                 }
3746                 block_group->discard_cursor = start;
3747
3748                 if (fatal_signal_pending(current)) {
3749                         if (start != offset)
3750                                 reset_trimming_bitmap(ctl, offset);
3751                         ret = -ERESTARTSYS;
3752                         break;
3753                 }
3754
3755                 cond_resched();
3756         }
3757
3758         if (offset >= end)
3759                 block_group->discard_cursor = end;
3760
3761 out:
3762         return ret;
3763 }
3764
3765 void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
3766 {
3767         atomic_inc(&cache->trimming);
3768 }
3769
3770 void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
3771 {
3772         struct btrfs_fs_info *fs_info = block_group->fs_info;
3773         struct extent_map_tree *em_tree;
3774         struct extent_map *em;
3775         bool cleanup;
3776
3777         spin_lock(&block_group->lock);
3778         cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3779                    block_group->removed);
3780         spin_unlock(&block_group->lock);
3781
3782         if (cleanup) {
3783                 mutex_lock(&fs_info->chunk_mutex);
3784                 em_tree = &fs_info->mapping_tree;
3785                 write_lock(&em_tree->lock);
3786                 em = lookup_extent_mapping(em_tree, block_group->start,
3787                                            1);
3788                 BUG_ON(!em); /* logic error, can't happen */
3789                 remove_extent_mapping(em_tree, em);
3790                 write_unlock(&em_tree->lock);
3791                 mutex_unlock(&fs_info->chunk_mutex);
3792
3793                 /* once for us and once for the tree */
3794                 free_extent_map(em);
3795                 free_extent_map(em);
3796
3797                 /*
3798                  * We've left one free space entry and other tasks trimming
3799                  * this block group have each left one entry.  Free them.
3800                  */
3801                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3802         }
3803 }
3804
3805 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3806                            u64 *trimmed, u64 start, u64 end, u64 minlen)
3807 {
3808         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3809         int ret;
3810         u64 rem = 0;
3811
3812         *trimmed = 0;
3813
3814         spin_lock(&block_group->lock);
3815         if (block_group->removed) {
3816                 spin_unlock(&block_group->lock);
3817                 return 0;
3818         }
3819         btrfs_get_block_group_trimming(block_group);
3820         spin_unlock(&block_group->lock);
3821
3822         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3823         if (ret)
3824                 goto out;
3825
3826         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3827         div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3828         /* If we ended in the middle of a bitmap, reset the trimming flag */
3829         if (rem)
3830                 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3831 out:
3832         btrfs_put_block_group_trimming(block_group);
3833         return ret;
3834 }
3835
3836 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
3837                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3838                                    bool async)
3839 {
3840         int ret;
3841
3842         *trimmed = 0;
3843
3844         spin_lock(&block_group->lock);
3845         if (block_group->removed) {
3846                 spin_unlock(&block_group->lock);
3847                 return 0;
3848         }
3849         btrfs_get_block_group_trimming(block_group);
3850         spin_unlock(&block_group->lock);
3851
3852         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
3853         btrfs_put_block_group_trimming(block_group);
3854
3855         return ret;
3856 }
3857
3858 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
3859                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3860                                    u64 maxlen, bool async)
3861 {
3862         int ret;
3863
3864         *trimmed = 0;
3865
3866         spin_lock(&block_group->lock);
3867         if (block_group->removed) {
3868                 spin_unlock(&block_group->lock);
3869                 return 0;
3870         }
3871         btrfs_get_block_group_trimming(block_group);
3872         spin_unlock(&block_group->lock);
3873
3874         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
3875                            async);
3876
3877         btrfs_put_block_group_trimming(block_group);
3878
3879         return ret;
3880 }
3881
3882 /*
3883  * Find the left-most item in the cache tree, and then return the
3884  * smallest inode number in the item.
3885  *
3886  * Note: the returned inode number may not be the smallest one in
3887  * the tree, if the left-most item is a bitmap.
3888  */
3889 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3890 {
3891         struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3892         struct btrfs_free_space *entry = NULL;
3893         u64 ino = 0;
3894
3895         spin_lock(&ctl->tree_lock);
3896
3897         if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3898                 goto out;
3899
3900         entry = rb_entry(rb_first(&ctl->free_space_offset),
3901                          struct btrfs_free_space, offset_index);
3902
3903         if (!entry->bitmap) {
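                /*
                 * Extent entry: hand out its first offset as the inode number
                 * and shrink the entry by one.
                 */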
3904                 ino = entry->offset;
3905
3906                 unlink_free_space(ctl, entry);
3907                 entry->offset++;
3908                 entry->bytes--;
3909                 if (!entry->bytes)
3910                         kmem_cache_free(btrfs_free_space_cachep, entry);
3911                 else
3912                         link_free_space(ctl, entry);
3913         } else {
3914                 u64 offset = 0;
3915                 u64 count = 1;
3916                 int ret;
3917
3918                 ret = search_bitmap(ctl, entry, &offset, &count, true);
3919                 /* Logic error; should be empty if it can't find anything */
3920                 ASSERT(!ret);
3921
3922                 ino = offset;
3923                 bitmap_clear_bits(ctl, entry, offset, 1);
3924                 if (entry->bytes == 0)
3925                         free_bitmap(ctl, entry);
3926         }
3927 out:
3928         spin_unlock(&ctl->tree_lock);
3929
3930         return ino;
3931 }
3932
3933 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3934                                     struct btrfs_path *path)
3935 {
3936         struct inode *inode = NULL;
3937
3938         spin_lock(&root->ino_cache_lock);
3939         if (root->ino_cache_inode)
3940                 inode = igrab(root->ino_cache_inode);
3941         spin_unlock(&root->ino_cache_lock);
3942         if (inode)
3943                 return inode;
3944
3945         inode = __lookup_free_space_inode(root, path, 0);
3946         if (IS_ERR(inode))
3947                 return inode;
3948
3949         spin_lock(&root->ino_cache_lock);
3950         if (!btrfs_fs_closing(root->fs_info))
3951                 root->ino_cache_inode = igrab(inode);
3952         spin_unlock(&root->ino_cache_lock);
3953
3954         return inode;
3955 }
3956
3957 int create_free_ino_inode(struct btrfs_root *root,
3958                           struct btrfs_trans_handle *trans,
3959                           struct btrfs_path *path)
3960 {
3961         return __create_free_space_inode(root, trans, path,
3962                                          BTRFS_FREE_INO_OBJECTID, 0);
3963 }
3964
3965 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3966 {
3967         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3968         struct btrfs_path *path;
3969         struct inode *inode;
3970         int ret = 0;
3971         u64 root_gen = btrfs_root_generation(&root->root_item);
3972
3973         if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3974                 return 0;
3975
3976         /*
3977          * If we're unmounting then just return, since this does a search on the
3978          * normal root and not the commit root, so we could deadlock.
3979          */
3980         if (btrfs_fs_closing(fs_info))
3981                 return 0;
3982
3983         path = btrfs_alloc_path();
3984         if (!path)
3985                 return 0;
3986
3987         inode = lookup_free_ino_inode(root, path);
3988         if (IS_ERR(inode))
3989                 goto out;
3990
3991         if (root_gen != BTRFS_I(inode)->generation)
3992                 goto out_put;
3993
3994         ret = __load_free_space_cache(root, inode, ctl, path, 0);
3995
3996         if (ret < 0)
3997                 btrfs_err(fs_info,
3998                         "failed to load free ino cache for root %llu",
3999                         root->root_key.objectid);
4000 out_put:
4001         iput(inode);
4002 out:
4003         btrfs_free_path(path);
4004         return ret;
4005 }
4006
4007 int btrfs_write_out_ino_cache(struct btrfs_root *root,
4008                               struct btrfs_trans_handle *trans,
4009                               struct btrfs_path *path,
4010                               struct inode *inode)
4011 {
4012         struct btrfs_fs_info *fs_info = root->fs_info;
4013         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
4014         int ret;
4015         struct btrfs_io_ctl io_ctl;
4016         bool release_metadata = true;
4017
4018         if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
4019                 return 0;
4020
4021         memset(&io_ctl, 0, sizeof(io_ctl));
4022         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
4023         if (!ret) {
4024                 /*
4025                  * At this point writepages() didn't error out, so our metadata
4026                  * reservation is released when the writeback finishes, at
4027                  * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
4028                  * with or without an error.
4029                  */
4030                 release_metadata = false;
4031                 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
4032         }
4033
4034         if (ret) {
4035                 if (release_metadata)
4036                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
4037                                         inode->i_size, true);
4038 #ifdef CONFIG_BTRFS_DEBUG
4039                 btrfs_err(fs_info,
4040                           "failed to write free ino cache for root %llu",
4041                           root->root_key.objectid);
4042 #endif
4043         }
4044
4045         return ret;
4046 }
4047
4048 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4049 /*
4050  * Use this if you need to make a bitmap or extent entry specifically, it
4051  * doesn't do any of the merging that add_free_space does, this acts a lot like
4052  * how the free space cache loading stuff works, so you can get really weird
4053  * configurations.
4054  */
4055 int test_add_free_space_entry(struct btrfs_block_group *cache,
4056                               u64 offset, u64 bytes, bool bitmap)
4057 {
4058         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4059         struct btrfs_free_space *info = NULL, *bitmap_info;
4060         void *map = NULL;
4061         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4062         u64 bytes_added;
4063         int ret;
4064
4065 again:
4066         if (!info) {
4067                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4068                 if (!info)
4069                         return -ENOMEM;
4070         }
4071
4072         if (!bitmap) {
4073                 spin_lock(&ctl->tree_lock);
4074                 info->offset = offset;
4075                 info->bytes = bytes;
4076                 info->max_extent_size = 0;
4077                 ret = link_free_space(ctl, info);
4078                 spin_unlock(&ctl->tree_lock);
4079                 if (ret)
4080                         kmem_cache_free(btrfs_free_space_cachep, info);
4081                 return ret;
4082         }
4083
4084         if (!map) {
4085                 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4086                 if (!map) {
4087                         kmem_cache_free(btrfs_free_space_cachep, info);
4088                         return -ENOMEM;
4089                 }
4090         }
4091
4092         spin_lock(&ctl->tree_lock);
4093         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4094                                          1, 0);
4095         if (!bitmap_info) {
4096                 info->bitmap = map;
4097                 map = NULL;
4098                 add_new_bitmap(ctl, info, offset);
4099                 bitmap_info = info;
4100                 info = NULL;
4101         }
4102
4103         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4104                                           trim_state);
4105
4106         bytes -= bytes_added;
4107         offset += bytes_added;
4108         spin_unlock(&ctl->tree_lock);
4109
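        /*
         * add_bytes_to_bitmap() only fills the bitmap covering @offset, so
         * loop again to spill any remainder into the next bitmap.
         */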
4110         if (bytes)
4111                 goto again;
4112
4113         if (info)
4114                 kmem_cache_free(btrfs_free_space_cachep, info);
4115         if (map)
4116                 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4117         return 0;
4118 }
4119
4120 /*
4121  * Checks to see if the given range is in the free space cache.  This is really
4122  * just used to check the absence of space, so if there is free space in the
4123  * range at all we will return 1.
4124  */
4125 int test_check_exists(struct btrfs_block_group *cache,
4126                       u64 offset, u64 bytes)
4127 {
4128         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4129         struct btrfs_free_space *info;
4130         int ret = 0;
4131
4132         spin_lock(&ctl->tree_lock);
4133         info = tree_search_offset(ctl, offset, 0, 0);
4134         if (!info) {
4135                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4136                                           1, 0);
4137                 if (!info)
4138                         goto out;
4139         }
4140
4141 have_info:
4142         if (info->bitmap) {
4143                 u64 bit_off, bit_bytes;
4144                 struct rb_node *n;
4145                 struct btrfs_free_space *tmp;
4146
4147                 bit_off = offset;
4148                 bit_bytes = ctl->unit;
4149                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4150                 if (!ret) {
4151                         if (bit_off == offset) {
4152                                 ret = 1;
4153                                 goto out;
4154                         } else if (bit_off > offset &&
4155                                    offset + bytes > bit_off) {
4156                                 ret = 1;
4157                                 goto out;
4158                         }
4159                 }
4160
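                /*
                 * The bitmap's free space doesn't overlap the range; check the
                 * neighbouring entries that might overlap it instead.
                 */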
4161                 n = rb_prev(&info->offset_index);
4162                 while (n) {
4163                         tmp = rb_entry(n, struct btrfs_free_space,
4164                                        offset_index);
4165                         if (tmp->offset + tmp->bytes < offset)
4166                                 break;
4167                         if (offset + bytes < tmp->offset) {
4168                                 n = rb_prev(&tmp->offset_index);
4169                                 continue;
4170                         }
4171                         info = tmp;
4172                         goto have_info;
4173                 }
4174
4175                 n = rb_next(&info->offset_index);
4176                 while (n) {
4177                         tmp = rb_entry(n, struct btrfs_free_space,
4178                                        offset_index);
4179                         if (offset + bytes < tmp->offset)
4180                                 break;
4181                         if (tmp->offset + tmp->bytes < offset) {
4182                                 n = rb_next(&tmp->offset_index);
4183                                 continue;
4184                         }
4185                         info = tmp;
4186                         goto have_info;
4187                 }
4188
4189                 ret = 0;
4190                 goto out;
4191         }
4192
4193         if (info->offset == offset) {
4194                 ret = 1;
4195                 goto out;
4196         }
4197
4198         if (offset > info->offset && offset < info->offset + info->bytes)
4199                 ret = 1;
4200 out:
4201         spin_unlock(&ctl->tree_lock);
4202         return ret;
4203 }
4204 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */