fs/btrfs/free-space-cache.c (linux-2.6-microblaze.git, commit fe4b2176a6e8fb7e48da86c66fdbe7347c2ee6ed)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "volumes.h"
20 #include "space-info.h"
21 #include "delalloc-space.h"
22 #include "block-group.h"
23 #include "discard.h"
24
25 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
26 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
27 #define FORCE_EXTENT_THRESHOLD  SZ_1M
28
29 struct btrfs_trim_range {
30         u64 start;
31         u64 bytes;
32         struct list_head list;
33 };
34
35 static int link_free_space(struct btrfs_free_space_ctl *ctl,
36                            struct btrfs_free_space *info);
37 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
38                               struct btrfs_free_space *info);
39 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
40                          struct btrfs_free_space *bitmap_info, u64 *offset,
41                          u64 *bytes, bool for_alloc);
42 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
43                         struct btrfs_free_space *bitmap_info);
44 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
45                               struct btrfs_free_space *info, u64 offset,
46                               u64 bytes);
47
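/*
 * Find the free space cache inode for the block group starting at @offset:
 * read the free space header item from @root to get the inode's key, then
 * iget that inode (under NOFS, since we are often under a transaction here).
 */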
48 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
49                                                struct btrfs_path *path,
50                                                u64 offset)
51 {
52         struct btrfs_fs_info *fs_info = root->fs_info;
53         struct btrfs_key key;
54         struct btrfs_key location;
55         struct btrfs_disk_key disk_key;
56         struct btrfs_free_space_header *header;
57         struct extent_buffer *leaf;
58         struct inode *inode = NULL;
59         unsigned nofs_flag;
60         int ret;
61
62         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
63         key.offset = offset;
64         key.type = 0;
65
66         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
67         if (ret < 0)
68                 return ERR_PTR(ret);
69         if (ret > 0) {
70                 btrfs_release_path(path);
71                 return ERR_PTR(-ENOENT);
72         }
73
74         leaf = path->nodes[0];
75         header = btrfs_item_ptr(leaf, path->slots[0],
76                                 struct btrfs_free_space_header);
77         btrfs_free_space_key(leaf, header, &disk_key);
78         btrfs_disk_key_to_cpu(&location, &disk_key);
79         btrfs_release_path(path);
80
81         /*
82          * We are often under a trans handle at this point, so we need to make
83          * sure NOFS is set to keep us from deadlocking.
84          */
85         nofs_flag = memalloc_nofs_save();
86         inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
87         btrfs_release_path(path);
88         memalloc_nofs_restore(nofs_flag);
89         if (IS_ERR(inode))
90                 return inode;
91
92         mapping_set_gfp_mask(inode->i_mapping,
93                         mapping_gfp_constraint(inode->i_mapping,
94                         ~(__GFP_FS | __GFP_HIGHMEM)));
95
96         return inode;
97 }
98
99 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
100                 struct btrfs_path *path)
101 {
102         struct btrfs_fs_info *fs_info = block_group->fs_info;
103         struct inode *inode = NULL;
104         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
105
106         spin_lock(&block_group->lock);
107         if (block_group->inode)
108                 inode = igrab(block_group->inode);
109         spin_unlock(&block_group->lock);
110         if (inode)
111                 return inode;
112
113         inode = __lookup_free_space_inode(fs_info->tree_root, path,
114                                           block_group->start);
115         if (IS_ERR(inode))
116                 return inode;
117
118         spin_lock(&block_group->lock);
119         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
120                 btrfs_info(fs_info, "Old style space inode found, converting.");
121                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
122                         BTRFS_INODE_NODATACOW;
123                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
124         }
125
126         if (!block_group->iref) {
127                 block_group->inode = igrab(inode);
128                 block_group->iref = 1;
129         }
130         spin_unlock(&block_group->lock);
131
132         return inode;
133 }
134
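/*
 * Create the inode item and the free space header item that describe a new
 * free space cache file for the block group starting at @offset.
 */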
135 static int __create_free_space_inode(struct btrfs_root *root,
136                                      struct btrfs_trans_handle *trans,
137                                      struct btrfs_path *path,
138                                      u64 ino, u64 offset)
139 {
140         struct btrfs_key key;
141         struct btrfs_disk_key disk_key;
142         struct btrfs_free_space_header *header;
143         struct btrfs_inode_item *inode_item;
144         struct extent_buffer *leaf;
145         /* We inline CRCs for the free disk space cache */
146         const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
147                           BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
148         int ret;
149
150         ret = btrfs_insert_empty_inode(trans, root, path, ino);
151         if (ret)
152                 return ret;
153
154         leaf = path->nodes[0];
155         inode_item = btrfs_item_ptr(leaf, path->slots[0],
156                                     struct btrfs_inode_item);
157         btrfs_item_key(leaf, &disk_key, path->slots[0]);
158         memzero_extent_buffer(leaf, (unsigned long)inode_item,
159                              sizeof(*inode_item));
160         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
161         btrfs_set_inode_size(leaf, inode_item, 0);
162         btrfs_set_inode_nbytes(leaf, inode_item, 0);
163         btrfs_set_inode_uid(leaf, inode_item, 0);
164         btrfs_set_inode_gid(leaf, inode_item, 0);
165         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
166         btrfs_set_inode_flags(leaf, inode_item, flags);
167         btrfs_set_inode_nlink(leaf, inode_item, 1);
168         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
169         btrfs_set_inode_block_group(leaf, inode_item, offset);
170         btrfs_mark_buffer_dirty(leaf);
171         btrfs_release_path(path);
172
173         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
174         key.offset = offset;
175         key.type = 0;
176         ret = btrfs_insert_empty_item(trans, root, path, &key,
177                                       sizeof(struct btrfs_free_space_header));
178         if (ret < 0) {
179                 btrfs_release_path(path);
180                 return ret;
181         }
182
183         leaf = path->nodes[0];
184         header = btrfs_item_ptr(leaf, path->slots[0],
185                                 struct btrfs_free_space_header);
186         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
187         btrfs_set_free_space_key(leaf, header, &disk_key);
188         btrfs_mark_buffer_dirty(leaf);
189         btrfs_release_path(path);
190
191         return 0;
192 }
193
194 int create_free_space_inode(struct btrfs_trans_handle *trans,
195                             struct btrfs_block_group *block_group,
196                             struct btrfs_path *path)
197 {
198         int ret;
199         u64 ino;
200
201         ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
202         if (ret < 0)
203                 return ret;
204
205         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
206                                          ino, block_group->start);
207 }
208
209 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
210                                        struct btrfs_block_rsv *rsv)
211 {
212         u64 needed_bytes;
213         int ret;
214
215         /* 1 for slack space, 1 for updating the inode */
216         needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
217                 btrfs_calc_metadata_size(fs_info, 1);
218
219         spin_lock(&rsv->lock);
220         if (rsv->reserved < needed_bytes)
221                 ret = -ENOSPC;
222         else
223                 ret = 0;
224         spin_unlock(&rsv->lock);
225         return ret;
226 }
227
228 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
229                                     struct btrfs_block_group *block_group,
230                                     struct inode *inode)
231 {
232         struct btrfs_root *root = BTRFS_I(inode)->root;
233         int ret = 0;
234         bool locked = false;
235
236         if (block_group) {
237                 struct btrfs_path *path = btrfs_alloc_path();
238
239                 if (!path) {
240                         ret = -ENOMEM;
241                         goto fail;
242                 }
243                 locked = true;
244                 mutex_lock(&trans->transaction->cache_write_mutex);
245                 if (!list_empty(&block_group->io_list)) {
246                         list_del_init(&block_group->io_list);
247
248                         btrfs_wait_cache_io(trans, block_group, path);
249                         btrfs_put_block_group(block_group);
250                 }
251
252                 /*
253                  * Now that we've truncated the cache away, it's no longer
254                  * set up or written.
255                  */
256                 spin_lock(&block_group->lock);
257                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
258                 spin_unlock(&block_group->lock);
259                 btrfs_free_path(path);
260         }
261
262         btrfs_i_size_write(BTRFS_I(inode), 0);
263         truncate_pagecache(inode, 0);
264
265         /*
266          * We skip the throttling logic for free space cache inodes, so we don't
267          * need to check for -EAGAIN.
268          */
269         ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
270                                          0, BTRFS_EXTENT_DATA_KEY);
271         if (ret)
272                 goto fail;
273
274         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
275
276 fail:
277         if (locked)
278                 mutex_unlock(&trans->transaction->cache_write_mutex);
279         if (ret)
280                 btrfs_abort_transaction(trans, ret);
281
282         return ret;
283 }
284
285 static void readahead_cache(struct inode *inode)
286 {
287         struct file_ra_state *ra;
288         unsigned long last_index;
289
290         ra = kzalloc(sizeof(*ra), GFP_NOFS);
291         if (!ra)
292                 return;
293
294         file_ra_state_init(ra, inode->i_mapping);
295         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
296
297         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
298
299         kfree(ra);
300 }
301
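/*
 * Initialize an io_ctl for reading or writing the cache inode: size the page
 * array from i_size and, for writes, make sure the crc array plus the
 * generation still fit in the first page.
 */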
302 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
303                        int write)
304 {
305         int num_pages;
306
307         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
308
309         /* Make sure we can fit our crcs and generation into the first page */
310         if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
311                 return -ENOSPC;
312
313         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
314
315         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
316         if (!io_ctl->pages)
317                 return -ENOMEM;
318
319         io_ctl->num_pages = num_pages;
320         io_ctl->fs_info = btrfs_sb(inode->i_sb);
321         io_ctl->inode = inode;
322
323         return 0;
324 }
325 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
326
327 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
328 {
329         kfree(io_ctl->pages);
330         io_ctl->pages = NULL;
331 }
332
333 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
334 {
335         if (io_ctl->cur) {
336                 io_ctl->cur = NULL;
337                 io_ctl->orig = NULL;
338         }
339 }
340
341 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
342 {
343         ASSERT(io_ctl->index < io_ctl->num_pages);
344         io_ctl->page = io_ctl->pages[io_ctl->index++];
345         io_ctl->cur = page_address(io_ctl->page);
346         io_ctl->orig = io_ctl->cur;
347         io_ctl->size = PAGE_SIZE;
348         if (clear)
349                 clear_page(io_ctl->cur);
350 }
351
352 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
353 {
354         int i;
355
356         io_ctl_unmap_page(io_ctl);
357
358         for (i = 0; i < io_ctl->num_pages; i++) {
359                 if (io_ctl->pages[i]) {
360                         ClearPageChecked(io_ctl->pages[i]);
361                         unlock_page(io_ctl->pages[i]);
362                         put_page(io_ctl->pages[i]);
363                 }
364         }
365 }
366
367 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
368 {
369         struct page *page;
370         struct inode *inode = io_ctl->inode;
371         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
372         int i;
373
374         for (i = 0; i < io_ctl->num_pages; i++) {
375                 page = find_or_create_page(inode->i_mapping, i, mask);
376                 if (!page) {
377                         io_ctl_drop_pages(io_ctl);
378                         return -ENOMEM;
379                 }
380                 io_ctl->pages[i] = page;
381                 if (uptodate && !PageUptodate(page)) {
382                         btrfs_readpage(NULL, page);
383                         lock_page(page);
384                         if (page->mapping != inode->i_mapping) {
385                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
386                                           "free space cache page truncated");
387                                 io_ctl_drop_pages(io_ctl);
388                                 return -EIO;
389                         }
390                         if (!PageUptodate(page)) {
391                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
392                                            "error reading free space cache");
393                                 io_ctl_drop_pages(io_ctl);
394                                 return -EIO;
395                         }
396                 }
397         }
398
399         for (i = 0; i < io_ctl->num_pages; i++) {
400                 clear_page_dirty_for_io(io_ctl->pages[i]);
401                 set_page_extent_mapped(io_ctl->pages[i]);
402         }
403
404         return 0;
405 }
406
407 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
408 {
409         io_ctl_map_page(io_ctl, 1);
410
411         /*
412          * Skip the csum areas.  The generation is stored as a 64bit chunk
413          * right after the crc array at the front of the first page.
414          */
415         io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
416         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
417
418         put_unaligned_le64(generation, io_ctl->cur);
419         io_ctl->cur += sizeof(u64);
420 }
421
422 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
423 {
424         u64 cache_gen;
425
426         /*
427          * Skip the crc area.  The cached generation is stored as a 64bit
428          * chunk right after the crc array at the front of the first page.
429          */
430         io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
431         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
432
433         cache_gen = get_unaligned_le64(io_ctl->cur);
434         if (cache_gen != generation) {
435                 btrfs_err_rl(io_ctl->fs_info,
436                         "space cache generation (%llu) does not match inode (%llu)",
437                                 cache_gen, generation);
438                 io_ctl_unmap_page(io_ctl);
439                 return -EIO;
440         }
441         io_ctl->cur += sizeof(u64);
442         return 0;
443 }
444
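/*
 * Checksum the page at @index (skipping the crc array itself on page 0) and
 * store the resulting crc32c in the crc array at the front of the first page.
 */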
445 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
446 {
447         u32 *tmp;
448         u32 crc = ~(u32)0;
449         unsigned offset = 0;
450
451         if (index == 0)
452                 offset = sizeof(u32) * io_ctl->num_pages;
453
454         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
455         btrfs_crc32c_final(crc, (u8 *)&crc);
456         io_ctl_unmap_page(io_ctl);
457         tmp = page_address(io_ctl->pages[0]);
458         tmp += index;
459         *tmp = crc;
460 }
461
462 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
463 {
464         u32 *tmp, val;
465         u32 crc = ~(u32)0;
466         unsigned offset = 0;
467
468         if (index == 0)
469                 offset = sizeof(u32) * io_ctl->num_pages;
470
471         tmp = page_address(io_ctl->pages[0]);
472         tmp += index;
473         val = *tmp;
474
475         io_ctl_map_page(io_ctl, 0);
476         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
477         btrfs_crc32c_final(crc, (u8 *)&crc);
478         if (val != crc) {
479                 btrfs_err_rl(io_ctl->fs_info,
480                         "csum mismatch on free space cache");
481                 io_ctl_unmap_page(io_ctl);
482                 return -EIO;
483         }
484
485         return 0;
486 }
487
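/*
 * Write one free space entry (offset, length and type) at the current
 * position in the cache file, checksumming the current page and mapping the
 * next one once this page fills up.
 */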
488 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
489                             void *bitmap)
490 {
491         struct btrfs_free_space_entry *entry;
492
493         if (!io_ctl->cur)
494                 return -ENOSPC;
495
496         entry = io_ctl->cur;
497         put_unaligned_le64(offset, &entry->offset);
498         put_unaligned_le64(bytes, &entry->bytes);
499         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
500                 BTRFS_FREE_SPACE_EXTENT;
501         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
502         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
503
504         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
505                 return 0;
506
507         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
508
509         /* No more pages to map */
510         if (io_ctl->index >= io_ctl->num_pages)
511                 return 0;
512
513         /* map the next page */
514         io_ctl_map_page(io_ctl, 1);
515         return 0;
516 }
517
518 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
519 {
520         if (!io_ctl->cur)
521                 return -ENOSPC;
522
523         /*
524          * If we aren't at the start of the current page, unmap this one and
525          * map the next one if there is any left.
526          */
527         if (io_ctl->cur != io_ctl->orig) {
528                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
529                 if (io_ctl->index >= io_ctl->num_pages)
530                         return -ENOSPC;
531                 io_ctl_map_page(io_ctl, 0);
532         }
533
534         copy_page(io_ctl->cur, bitmap);
535         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
536         if (io_ctl->index < io_ctl->num_pages)
537                 io_ctl_map_page(io_ctl, 0);
538         return 0;
539 }
540
541 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
542 {
543         /*
544          * If we're not on the boundary we know we've modified the page and we
545          * need to crc the page.
546          */
547         if (io_ctl->cur != io_ctl->orig)
548                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
549         else
550                 io_ctl_unmap_page(io_ctl);
551
552         while (io_ctl->index < io_ctl->num_pages) {
553                 io_ctl_map_page(io_ctl, 1);
554                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
555         }
556 }
557
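/*
 * Read the next on-disk free space entry into @entry, verifying a page's crc
 * the first time we read from it.
 */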
558 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
559                             struct btrfs_free_space *entry, u8 *type)
560 {
561         struct btrfs_free_space_entry *e;
562         int ret;
563
564         if (!io_ctl->cur) {
565                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
566                 if (ret)
567                         return ret;
568         }
569
570         e = io_ctl->cur;
571         entry->offset = get_unaligned_le64(&e->offset);
572         entry->bytes = get_unaligned_le64(&e->bytes);
573         *type = e->type;
574         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
575         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
576
577         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
578                 return 0;
579
580         io_ctl_unmap_page(io_ctl);
581
582         return 0;
583 }
584
585 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
586                               struct btrfs_free_space *entry)
587 {
588         int ret;
589
590         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
591         if (ret)
592                 return ret;
593
594         copy_page(entry->bitmap, io_ctl->cur);
595         io_ctl_unmap_page(io_ctl);
596
597         return 0;
598 }
599
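/*
 * Recompute ctl->extents_thresh, the extent entry threshold for this block
 * group, so that the memory used by extent entries stays within the per-GiB
 * budget after accounting for the memory already used by bitmaps.
 */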
600 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
601 {
602         struct btrfs_block_group *block_group = ctl->private;
603         u64 max_bytes;
604         u64 bitmap_bytes;
605         u64 extent_bytes;
606         u64 size = block_group->length;
607         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
608         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
609
610         max_bitmaps = max_t(u64, max_bitmaps, 1);
611
612         ASSERT(ctl->total_bitmaps <= max_bitmaps);
613
614         /*
615          * We are trying to keep the total amount of memory used per 1GiB of
616          * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
617          * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
618          * bitmaps, we may end up using more memory than this.
619          */
620         if (size < SZ_1G)
621                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
622         else
623                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
624
625         bitmap_bytes = ctl->total_bitmaps * ctl->unit;
626
627         /*
628          * We want the extent entry threshold to be at most half of max_bytes,
629          * or whatever is left after the bitmaps, whichever is smaller.
630          */
631         extent_bytes = max_bytes - bitmap_bytes;
632         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
633
634         ctl->extents_thresh =
635                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
636 }
637
638 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
639                                    struct btrfs_free_space_ctl *ctl,
640                                    struct btrfs_path *path, u64 offset)
641 {
642         struct btrfs_fs_info *fs_info = root->fs_info;
643         struct btrfs_free_space_header *header;
644         struct extent_buffer *leaf;
645         struct btrfs_io_ctl io_ctl;
646         struct btrfs_key key;
647         struct btrfs_free_space *e, *n;
648         LIST_HEAD(bitmaps);
649         u64 num_entries;
650         u64 num_bitmaps;
651         u64 generation;
652         u8 type;
653         int ret = 0;
654
655         /* Nothing in the space cache, goodbye */
656         if (!i_size_read(inode))
657                 return 0;
658
659         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
660         key.offset = offset;
661         key.type = 0;
662
663         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
664         if (ret < 0)
665                 return 0;
666         else if (ret > 0) {
667                 btrfs_release_path(path);
668                 return 0;
669         }
670
671         ret = -1;
672
673         leaf = path->nodes[0];
674         header = btrfs_item_ptr(leaf, path->slots[0],
675                                 struct btrfs_free_space_header);
676         num_entries = btrfs_free_space_entries(leaf, header);
677         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
678         generation = btrfs_free_space_generation(leaf, header);
679         btrfs_release_path(path);
680
681         if (!BTRFS_I(inode)->generation) {
682                 btrfs_info(fs_info,
683                            "the free space cache file (%llu) is invalid, skip it",
684                            offset);
685                 return 0;
686         }
687
688         if (BTRFS_I(inode)->generation != generation) {
689                 btrfs_err(fs_info,
690                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
691                           BTRFS_I(inode)->generation, generation);
692                 return 0;
693         }
694
695         if (!num_entries)
696                 return 0;
697
698         ret = io_ctl_init(&io_ctl, inode, 0);
699         if (ret)
700                 return ret;
701
702         readahead_cache(inode);
703
704         ret = io_ctl_prepare_pages(&io_ctl, true);
705         if (ret)
706                 goto out;
707
708         ret = io_ctl_check_crc(&io_ctl, 0);
709         if (ret)
710                 goto free_cache;
711
712         ret = io_ctl_check_generation(&io_ctl, generation);
713         if (ret)
714                 goto free_cache;
715
716         while (num_entries) {
717                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
718                                       GFP_NOFS);
719                 if (!e)
720                         goto free_cache;
721
722                 ret = io_ctl_read_entry(&io_ctl, e, &type);
723                 if (ret) {
724                         kmem_cache_free(btrfs_free_space_cachep, e);
725                         goto free_cache;
726                 }
727
728                 if (!e->bytes) {
729                         kmem_cache_free(btrfs_free_space_cachep, e);
730                         goto free_cache;
731                 }
732
733                 if (type == BTRFS_FREE_SPACE_EXTENT) {
734                         spin_lock(&ctl->tree_lock);
735                         ret = link_free_space(ctl, e);
736                         spin_unlock(&ctl->tree_lock);
737                         if (ret) {
738                                 btrfs_err(fs_info,
739                                         "Duplicate entries in free space cache, dumping");
740                                 kmem_cache_free(btrfs_free_space_cachep, e);
741                                 goto free_cache;
742                         }
743                 } else {
744                         ASSERT(num_bitmaps);
745                         num_bitmaps--;
746                         e->bitmap = kmem_cache_zalloc(
747                                         btrfs_free_space_bitmap_cachep, GFP_NOFS);
748                         if (!e->bitmap) {
749                                 kmem_cache_free(
750                                         btrfs_free_space_cachep, e);
751                                 goto free_cache;
752                         }
753                         spin_lock(&ctl->tree_lock);
754                         ret = link_free_space(ctl, e);
755                         ctl->total_bitmaps++;
756                         recalculate_thresholds(ctl);
757                         spin_unlock(&ctl->tree_lock);
758                         if (ret) {
759                                 btrfs_err(fs_info,
760                                         "Duplicate entries in free space cache, dumping");
761                                 kmem_cache_free(btrfs_free_space_cachep, e);
762                                 goto free_cache;
763                         }
764                         list_add_tail(&e->list, &bitmaps);
765                 }
766
767                 num_entries--;
768         }
769
770         io_ctl_unmap_page(&io_ctl);
771
772         /*
773          * The bitmap pages are stored after all the entries in the cache
774          * file, so read them now, in the order their entries were added.
775          */
776         list_for_each_entry_safe(e, n, &bitmaps, list) {
777                 list_del_init(&e->list);
778                 ret = io_ctl_read_bitmap(&io_ctl, e);
779                 if (ret)
780                         goto free_cache;
781         }
782
783         io_ctl_drop_pages(&io_ctl);
784         ret = 1;
785 out:
786         io_ctl_free(&io_ctl);
787         return ret;
788 free_cache:
789         io_ctl_drop_pages(&io_ctl);
790         __btrfs_remove_free_space_cache(ctl);
791         goto out;
792 }
793
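/*
 * Move every entry from the temporary ctl filled by __load_free_space_cache()
 * into the block group's real free space ctl, turning bitmap bits back into
 * btrfs_add_free_space() calls.
 */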
794 static int copy_free_space_cache(struct btrfs_block_group *block_group,
795                                  struct btrfs_free_space_ctl *ctl)
796 {
797         struct btrfs_free_space *info;
798         struct rb_node *n;
799         int ret = 0;
800
801         while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
802                 info = rb_entry(n, struct btrfs_free_space, offset_index);
803                 if (!info->bitmap) {
804                         unlink_free_space(ctl, info);
805                         ret = btrfs_add_free_space(block_group, info->offset,
806                                                    info->bytes);
807                         kmem_cache_free(btrfs_free_space_cachep, info);
808                 } else {
809                         u64 offset = info->offset;
810                         u64 bytes = ctl->unit;
811
812                         while (search_bitmap(ctl, info, &offset, &bytes,
813                                              false) == 0) {
814                                 ret = btrfs_add_free_space(block_group, offset,
815                                                            bytes);
816                                 if (ret)
817                                         break;
818                                 bitmap_clear_bits(ctl, info, offset, bytes);
819                                 offset = info->offset;
820                                 bytes = ctl->unit;
821                         }
822                         free_bitmap(ctl, info);
823                 }
824                 cond_resched();
825         }
826         return ret;
827 }
828
829 int load_free_space_cache(struct btrfs_block_group *block_group)
830 {
831         struct btrfs_fs_info *fs_info = block_group->fs_info;
832         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
833         struct btrfs_free_space_ctl tmp_ctl = {};
834         struct inode *inode;
835         struct btrfs_path *path;
836         int ret = 0;
837         bool matched;
838         u64 used = block_group->used;
839
840         /*
841          * Because we could potentially discard our loaded free space, we want
842          * to load everything into a temporary structure first, and then if it's
843          * valid copy it all into the actual free space ctl.
844          */
845         btrfs_init_free_space_ctl(block_group, &tmp_ctl);
846
847         /*
848          * If this block group has been marked to be cleared for one reason or
849          * another then we can't trust the on disk cache, so just return.
850          */
851         spin_lock(&block_group->lock);
852         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
853                 spin_unlock(&block_group->lock);
854                 return 0;
855         }
856         spin_unlock(&block_group->lock);
857
858         path = btrfs_alloc_path();
859         if (!path)
860                 return 0;
861         path->search_commit_root = 1;
862         path->skip_locking = 1;
863
864         /*
865          * We must pass a path with search_commit_root set to btrfs_iget in
866          * order to avoid a deadlock when allocating extents for the tree root.
867          *
868          * When we are COWing an extent buffer from the tree root, when looking
869          * for a free extent, at extent-tree.c:find_free_extent(), we can find
870          * block group without its free space cache loaded. When we find one
871          * a block group without its free space cache loaded. When we find one
872          * cache's inode item from the root tree. If this inode item is located
873          * in the same leaf that we started COWing before, then we end up in
874          * deadlock on the extent buffer (trying to read lock it when we
875          * a deadlock on the extent buffer (trying to read lock it when we
876          *
877          * It's safe to read the inode item using the commit root because
878          * block groups, once loaded, stay in memory forever (until they are
879          * removed) as well as their space caches once loaded. New block groups
880          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
881          * we will never try to read their inode item while the fs is mounted.
882          */
883         inode = lookup_free_space_inode(block_group, path);
884         if (IS_ERR(inode)) {
885                 btrfs_free_path(path);
886                 return 0;
887         }
888
889         /* We may have converted the inode and made the cache invalid. */
890         spin_lock(&block_group->lock);
891         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
892                 spin_unlock(&block_group->lock);
893                 btrfs_free_path(path);
894                 goto out;
895         }
896         spin_unlock(&block_group->lock);
897
898         ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
899                                       path, block_group->start);
900         btrfs_free_path(path);
901         if (ret <= 0)
902                 goto out;
903
904         matched = (tmp_ctl.free_space == (block_group->length - used -
905                                           block_group->bytes_super));
906
907         if (matched) {
908                 ret = copy_free_space_cache(block_group, &tmp_ctl);
909                 /*
910                  * ret == 1 means the free space cache was loaded successfully,
911                  * but copy_free_space_cache() returns 0 on success, so reset ret here.
912                  */
913                 if (ret == 0)
914                         ret = 1;
915         } else {
916                 __btrfs_remove_free_space_cache(&tmp_ctl);
917                 btrfs_warn(fs_info,
918                            "block group %llu has wrong amount of free space",
919                            block_group->start);
920                 ret = -1;
921         }
922 out:
923         if (ret < 0) {
924                 /* This cache is bogus, make sure it gets cleared */
925                 spin_lock(&block_group->lock);
926                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
927                 spin_unlock(&block_group->lock);
928                 ret = 0;
929
930                 btrfs_warn(fs_info,
931                            "failed to load free space cache for block group %llu, rebuilding it now",
932                            block_group->start);
933         }
934
935         spin_lock(&ctl->tree_lock);
936         btrfs_discard_update_discardable(block_group);
937         spin_unlock(&ctl->tree_lock);
938         iput(inode);
939         return ret;
940 }
941
942 static noinline_for_stack
943 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
944                               struct btrfs_free_space_ctl *ctl,
945                               struct btrfs_block_group *block_group,
946                               int *entries, int *bitmaps,
947                               struct list_head *bitmap_list)
948 {
949         int ret;
950         struct btrfs_free_cluster *cluster = NULL;
951         struct btrfs_free_cluster *cluster_locked = NULL;
952         struct rb_node *node = rb_first(&ctl->free_space_offset);
953         struct btrfs_trim_range *trim_entry;
954
955         /* Get the cluster for this block_group if it exists */
956         if (block_group && !list_empty(&block_group->cluster_list)) {
957                 cluster = list_entry(block_group->cluster_list.next,
958                                      struct btrfs_free_cluster,
959                                      block_group_list);
960         }
961
962         if (!node && cluster) {
963                 cluster_locked = cluster;
964                 spin_lock(&cluster_locked->lock);
965                 node = rb_first(&cluster->root);
966                 cluster = NULL;
967         }
968
969         /* Write out the extent entries */
970         while (node) {
971                 struct btrfs_free_space *e;
972
973                 e = rb_entry(node, struct btrfs_free_space, offset_index);
974                 *entries += 1;
975
976                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
977                                        e->bitmap);
978                 if (ret)
979                         goto fail;
980
981                 if (e->bitmap) {
982                         list_add_tail(&e->list, bitmap_list);
983                         *bitmaps += 1;
984                 }
985                 node = rb_next(node);
986                 if (!node && cluster) {
987                         node = rb_first(&cluster->root);
988                         cluster_locked = cluster;
989                         spin_lock(&cluster_locked->lock);
990                         cluster = NULL;
991                 }
992         }
993         if (cluster_locked) {
994                 spin_unlock(&cluster_locked->lock);
995                 cluster_locked = NULL;
996         }
997
998         /*
999          * Make sure we don't miss any range that was removed from our rbtree
1000          * because trimming is running. Otherwise after a umount+mount (or crash
1001          * after committing the transaction) we would leak free space and get
1002          * an inconsistent free space cache report from fsck.
1003          */
1004         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1005                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1006                                        trim_entry->bytes, NULL);
1007                 if (ret)
1008                         goto fail;
1009                 *entries += 1;
1010         }
1011
1012         return 0;
1013 fail:
1014         if (cluster_locked)
1015                 spin_unlock(&cluster_locked->lock);
1016         return -ENOSPC;
1017 }
1018
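/*
 * Update the free space header item for this cache file with the number of
 * entries and bitmaps written and the current transaction's generation. If
 * the item cannot be found, clear the delalloc bits on the inode and fail.
 */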
1019 static noinline_for_stack int
1020 update_cache_item(struct btrfs_trans_handle *trans,
1021                   struct btrfs_root *root,
1022                   struct inode *inode,
1023                   struct btrfs_path *path, u64 offset,
1024                   int entries, int bitmaps)
1025 {
1026         struct btrfs_key key;
1027         struct btrfs_free_space_header *header;
1028         struct extent_buffer *leaf;
1029         int ret;
1030
1031         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1032         key.offset = offset;
1033         key.type = 0;
1034
1035         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1036         if (ret < 0) {
1037                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1038                                  EXTENT_DELALLOC, 0, 0, NULL);
1039                 goto fail;
1040         }
1041         leaf = path->nodes[0];
1042         if (ret > 0) {
1043                 struct btrfs_key found_key;
1044                 ASSERT(path->slots[0]);
1045                 path->slots[0]--;
1046                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1047                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1048                     found_key.offset != offset) {
1049                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1050                                          inode->i_size - 1, EXTENT_DELALLOC, 0,
1051                                          0, NULL);
1052                         btrfs_release_path(path);
1053                         goto fail;
1054                 }
1055         }
1056
1057         BTRFS_I(inode)->generation = trans->transid;
1058         header = btrfs_item_ptr(leaf, path->slots[0],
1059                                 struct btrfs_free_space_header);
1060         btrfs_set_free_space_entries(leaf, header, entries);
1061         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1062         btrfs_set_free_space_generation(leaf, header, trans->transid);
1063         btrfs_mark_buffer_dirty(leaf);
1064         btrfs_release_path(path);
1065
1066         return 0;
1067
1068 fail:
1069         return -1;
1070 }
1071
1072 static noinline_for_stack int write_pinned_extent_entries(
1073                             struct btrfs_trans_handle *trans,
1074                             struct btrfs_block_group *block_group,
1075                             struct btrfs_io_ctl *io_ctl,
1076                             int *entries)
1077 {
1078         u64 start, extent_start, extent_end, len;
1079         struct extent_io_tree *unpin = NULL;
1080         int ret;
1081
1082         if (!block_group)
1083                 return 0;
1084
1085         /*
1086          * We want to add any pinned extents to our free space cache
1087          * so we don't leak the space
1088          *
1089          * We shouldn't have switched the pinned extents yet so this is the
1090          * right one
1091          */
1092         unpin = &trans->transaction->pinned_extents;
1093
1094         start = block_group->start;
1095
1096         while (start < block_group->start + block_group->length) {
1097                 ret = find_first_extent_bit(unpin, start,
1098                                             &extent_start, &extent_end,
1099                                             EXTENT_DIRTY, NULL);
1100                 if (ret)
1101                         return 0;
1102
1103                 /* This pinned extent is out of our range */
1104                 if (extent_start >= block_group->start + block_group->length)
1105                         return 0;
1106
1107                 extent_start = max(extent_start, start);
1108                 extent_end = min(block_group->start + block_group->length,
1109                                  extent_end + 1);
1110                 len = extent_end - extent_start;
1111
1112                 *entries += 1;
1113                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1114                 if (ret)
1115                         return -ENOSPC;
1116
1117                 start = extent_end;
1118         }
1119
1120         return 0;
1121 }
1122
1123 static noinline_for_stack int
1124 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1125 {
1126         struct btrfs_free_space *entry, *next;
1127         int ret;
1128
1129         /* Write out the bitmaps */
1130         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1131                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1132                 if (ret)
1133                         return -ENOSPC;
1134                 list_del_init(&entry->list);
1135         }
1136
1137         return 0;
1138 }
1139
1140 static int flush_dirty_cache(struct inode *inode)
1141 {
1142         int ret;
1143
1144         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1145         if (ret)
1146                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1147                                  EXTENT_DELALLOC, 0, 0, NULL);
1148
1149         return ret;
1150 }
1151
1152 static void noinline_for_stack
1153 cleanup_bitmap_list(struct list_head *bitmap_list)
1154 {
1155         struct btrfs_free_space *entry, *next;
1156
1157         list_for_each_entry_safe(entry, next, bitmap_list, list)
1158                 list_del_init(&entry->list);
1159 }
1160
1161 static void noinline_for_stack
1162 cleanup_write_cache_enospc(struct inode *inode,
1163                            struct btrfs_io_ctl *io_ctl,
1164                            struct extent_state **cached_state)
1165 {
1166         io_ctl_drop_pages(io_ctl);
1167         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1168                              i_size_read(inode) - 1, cached_state);
1169 }
1170
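/*
 * Wait for the cache file's pages to be written out, then update the free
 * space header item so the cache is seen as valid. On failure, invalidate the
 * page cache and zero the inode generation so the stale cache is ignored on
 * the next load, and mark the block group's disk_cache_state accordingly.
 */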
1171 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1172                                  struct btrfs_trans_handle *trans,
1173                                  struct btrfs_block_group *block_group,
1174                                  struct btrfs_io_ctl *io_ctl,
1175                                  struct btrfs_path *path, u64 offset)
1176 {
1177         int ret;
1178         struct inode *inode = io_ctl->inode;
1179
1180         if (!inode)
1181                 return 0;
1182
1183         /* Flush the dirty pages in the cache file. */
1184         ret = flush_dirty_cache(inode);
1185         if (ret)
1186                 goto out;
1187
1188         /* Update the cache item to tell everyone this cache file is valid. */
1189         ret = update_cache_item(trans, root, inode, path, offset,
1190                                 io_ctl->entries, io_ctl->bitmaps);
1191 out:
1192         if (ret) {
1193                 invalidate_inode_pages2(inode->i_mapping);
1194                 BTRFS_I(inode)->generation = 0;
1195                 if (block_group)
1196                         btrfs_debug(root->fs_info,
1197           "failed to write free space cache for block group %llu error %d",
1198                                   block_group->start, ret);
1199         }
1200         btrfs_update_inode(trans, root, BTRFS_I(inode));
1201
1202         if (block_group) {
1203                 /* the dirty list is protected by the dirty_bgs_lock */
1204                 spin_lock(&trans->transaction->dirty_bgs_lock);
1205
1206                 /* the disk_cache_state is protected by the block group lock */
1207                 spin_lock(&block_group->lock);
1208
1209                 /*
1210                  * only mark this as written if we didn't get put back on
1211                  * the dirty list while waiting for IO.   Otherwise our
1212                  * cache state won't be right, and we won't get written again
1213                  */
1214                 if (!ret && list_empty(&block_group->dirty_list))
1215                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1216                 else if (ret)
1217                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1218
1219                 spin_unlock(&block_group->lock);
1220                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1221                 io_ctl->inode = NULL;
1222                 iput(inode);
1223         }
1224
1225         return ret;
1226
1227 }
1228
1229 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1230                         struct btrfs_block_group *block_group,
1231                         struct btrfs_path *path)
1232 {
1233         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1234                                      block_group, &block_group->io_ctl,
1235                                      path, block_group->start);
1236 }
1237
1238 /**
1239  * __btrfs_write_out_cache - write out cached info to an inode
1240  * @root: the root the inode belongs to
1241  * @ctl: the free space cache we are going to write out
1242  * @block_group: the block_group for this cache if it belongs to a block_group
1243  * @trans: the trans handle
1244  *
1245  * This function writes out a free space cache struct to disk for quick recovery
1246  * on mount.  This will return 0 if it was successful in writing the cache out,
1247  * or an errno if it was not.
1248  */
1249 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1250                                    struct btrfs_free_space_ctl *ctl,
1251                                    struct btrfs_block_group *block_group,
1252                                    struct btrfs_io_ctl *io_ctl,
1253                                    struct btrfs_trans_handle *trans)
1254 {
1255         struct extent_state *cached_state = NULL;
1256         LIST_HEAD(bitmap_list);
1257         int entries = 0;
1258         int bitmaps = 0;
1259         int ret;
1260         int must_iput = 0;
1261
1262         if (!i_size_read(inode))
1263                 return -EIO;
1264
1265         WARN_ON(io_ctl->pages);
1266         ret = io_ctl_init(io_ctl, inode, 1);
1267         if (ret)
1268                 return ret;
1269
1270         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1271                 down_write(&block_group->data_rwsem);
1272                 spin_lock(&block_group->lock);
1273                 if (block_group->delalloc_bytes) {
1274                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1275                         spin_unlock(&block_group->lock);
1276                         up_write(&block_group->data_rwsem);
1277                         BTRFS_I(inode)->generation = 0;
1278                         ret = 0;
1279                         must_iput = 1;
1280                         goto out;
1281                 }
1282                 spin_unlock(&block_group->lock);
1283         }
1284
1285         /* Lock all pages first so we can lock the extent safely. */
1286         ret = io_ctl_prepare_pages(io_ctl, false);
1287         if (ret)
1288                 goto out_unlock;
1289
1290         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1291                          &cached_state);
1292
1293         io_ctl_set_generation(io_ctl, trans->transid);
1294
1295         mutex_lock(&ctl->cache_writeout_mutex);
1296         /* Write out the extent entries in the free space cache */
1297         spin_lock(&ctl->tree_lock);
1298         ret = write_cache_extent_entries(io_ctl, ctl,
1299                                          block_group, &entries, &bitmaps,
1300                                          &bitmap_list);
1301         if (ret)
1302                 goto out_nospc_locked;
1303
1304         /*
1305          * Some space that is freed in the current transaction is pinned;
1306          * it will be added to the free space cache after the transaction
1307          * is committed, so we must not lose it.
1308          *
1309          * If this changes while we are working we'll get added back to
1310          * the dirty list and redo it.  No locking is needed.
1311          */
1312         ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1313         if (ret)
1314                 goto out_nospc_locked;
1315
1316         /*
1317          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1318          * locked while doing it because a concurrent trim can be manipulating
1319          * or freeing the bitmap.
1320          */
1321         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1322         spin_unlock(&ctl->tree_lock);
1323         mutex_unlock(&ctl->cache_writeout_mutex);
1324         if (ret)
1325                 goto out_nospc;
1326
1327         /* Zero out the rest of the pages just to make sure */
1328         io_ctl_zero_remaining_pages(io_ctl);
1329
1330         /* Everything is written out, now we dirty the pages in the file. */
1331         ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1332                                 io_ctl->num_pages, 0, i_size_read(inode),
1333                                 &cached_state, false);
1334         if (ret)
1335                 goto out_nospc;
1336
1337         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1338                 up_write(&block_group->data_rwsem);
1339         /*
1340          * Release the pages and unlock the extent; we will flush
1341          * them out later.
1342          */
1343         io_ctl_drop_pages(io_ctl);
1344         io_ctl_free(io_ctl);
1345
1346         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1347                              i_size_read(inode) - 1, &cached_state);
1348
1349         /*
1350          * At this point the pages are under IO and we're happy.
1351          * The caller is responsible for waiting on them and updating
1352          * the cache and the inode.
1353          */
1354         io_ctl->entries = entries;
1355         io_ctl->bitmaps = bitmaps;
1356
1357         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1358         if (ret)
1359                 goto out;
1360
1361         return 0;
1362
1363 out_nospc_locked:
1364         cleanup_bitmap_list(&bitmap_list);
1365         spin_unlock(&ctl->tree_lock);
1366         mutex_unlock(&ctl->cache_writeout_mutex);
1367
1368 out_nospc:
1369         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1370
1371 out_unlock:
1372         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1373                 up_write(&block_group->data_rwsem);
1374
1375 out:
1376         io_ctl->inode = NULL;
1377         io_ctl_free(io_ctl);
1378         if (ret) {
1379                 invalidate_inode_pages2(inode->i_mapping);
1380                 BTRFS_I(inode)->generation = 0;
1381         }
1382         btrfs_update_inode(trans, root, BTRFS_I(inode));
1383         if (must_iput)
1384                 iput(inode);
1385         return ret;
1386 }
1387
1388 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1389                           struct btrfs_block_group *block_group,
1390                           struct btrfs_path *path)
1391 {
1392         struct btrfs_fs_info *fs_info = trans->fs_info;
1393         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1394         struct inode *inode;
1395         int ret = 0;
1396
1397         spin_lock(&block_group->lock);
1398         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1399                 spin_unlock(&block_group->lock);
1400                 return 0;
1401         }
1402         spin_unlock(&block_group->lock);
1403
1404         inode = lookup_free_space_inode(block_group, path);
1405         if (IS_ERR(inode))
1406                 return 0;
1407
1408         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1409                                 block_group, &block_group->io_ctl, trans);
1410         if (ret) {
1411                 btrfs_debug(fs_info,
1412           "failed to write free space cache for block group %llu error %d",
1413                           block_group->start, ret);
1414                 spin_lock(&block_group->lock);
1415                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1416                 spin_unlock(&block_group->lock);
1417
1418                 block_group->io_ctl.inode = NULL;
1419                 iput(inode);
1420         }
1421
1422         /*
1423          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1424          * to wait for IO and put the inode
1425          */
1426
1427         return ret;
1428 }
1429
1430 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1431                                           u64 offset)
1432 {
1433         ASSERT(offset >= bitmap_start);
1434         offset -= bitmap_start;
1435         return (unsigned long)(div_u64(offset, unit));
1436 }
1437
1438 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1439 {
1440         return (unsigned long)(div_u64(bytes, unit));
1441 }
1442
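/* Round @offset down to the start of the bitmap that would cover it. */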
1443 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1444                                    u64 offset)
1445 {
1446         u64 bitmap_start;
1447         u64 bytes_per_bitmap;
1448
1449         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1450         bitmap_start = offset - ctl->start;
1451         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1452         bitmap_start *= bytes_per_bitmap;
1453         bitmap_start += ctl->start;
1454
1455         return bitmap_start;
1456 }
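/*
 * Illustrative arithmetic (assuming 4K pages and a 4K sectorsize, so
 * ctl->unit == 4096): BITS_PER_BITMAP is PAGE_SIZE * 8 == 32768, so one
 * bitmap entry covers 32768 * 4096 bytes == 128MiB.  An offset of
 * ctl->start + 200MiB therefore maps to the bitmap starting at
 * ctl->start + 128MiB.
 */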
1457
1458 static int tree_insert_offset(struct rb_root *root, u64 offset,
1459                               struct rb_node *node, int bitmap)
1460 {
1461         struct rb_node **p = &root->rb_node;
1462         struct rb_node *parent = NULL;
1463         struct btrfs_free_space *info;
1464
1465         while (*p) {
1466                 parent = *p;
1467                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1468
1469                 if (offset < info->offset) {
1470                         p = &(*p)->rb_left;
1471                 } else if (offset > info->offset) {
1472                         p = &(*p)->rb_right;
1473                 } else {
1474                         /*
1475                          * we could have a bitmap entry and an extent entry
1476                          * share the same offset.  If this is the case, we want
1477                          * the extent entry to always be found first if we do a
1478                          * linear search through the tree, since we want to have
1479                          * the quickest allocation time, and allocating from an
1480                          * extent is faster than allocating from a bitmap.  So
1481                          * if we're inserting a bitmap and we find an entry at
1482                          * this offset, we want to go right, or after this entry
1483                          * logically.  If we are inserting an extent and we've
1484                          * found a bitmap, we want to go left, or before
1485                          * logically.
1486                          */
1487                         if (bitmap) {
1488                                 if (info->bitmap) {
1489                                         WARN_ON_ONCE(1);
1490                                         return -EEXIST;
1491                                 }
1492                                 p = &(*p)->rb_right;
1493                         } else {
1494                                 if (!info->bitmap) {
1495                                         WARN_ON_ONCE(1);
1496                                         return -EEXIST;
1497                                 }
1498                                 p = &(*p)->rb_left;
1499                         }
1500                 }
1501         }
1502
1503         rb_link_node(node, parent, p);
1504         rb_insert_color(node, root);
1505
1506         return 0;
1507 }
1508
1509 /*
1510  * searches the tree for the given offset.
1511  *
1512  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1513  * want a section that is at least *bytes in size and comes at or after the
1514  * given offset.
1515  */
1516 static struct btrfs_free_space *
1517 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1518                    u64 offset, int bitmap_only, int fuzzy)
1519 {
1520         struct rb_node *n = ctl->free_space_offset.rb_node;
1521         struct btrfs_free_space *entry, *prev = NULL;
1522
1523         /* find entry that is closest to the 'offset' */
1524         while (1) {
1525                 if (!n) {
1526                         entry = NULL;
1527                         break;
1528                 }
1529
1530                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1531                 prev = entry;
1532
1533                 if (offset < entry->offset)
1534                         n = n->rb_left;
1535                 else if (offset > entry->offset)
1536                         n = n->rb_right;
1537                 else
1538                         break;
1539         }
1540
1541         if (bitmap_only) {
1542                 if (!entry)
1543                         return NULL;
1544                 if (entry->bitmap)
1545                         return entry;
1546
1547                 /*
1548                  * A bitmap entry and an extent entry may share the same offset;
1549                  * in that case, the bitmap entry comes after the extent entry.
1550                  */
1551                 n = rb_next(n);
1552                 if (!n)
1553                         return NULL;
1554                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1555                 if (entry->offset != offset)
1556                         return NULL;
1557
1558                 WARN_ON(!entry->bitmap);
1559                 return entry;
1560         } else if (entry) {
1561                 if (entry->bitmap) {
1562                         /*
1563                          * if previous extent entry covers the offset,
1564                          * we should return it instead of the bitmap entry
1565                          */
1566                         n = rb_prev(&entry->offset_index);
1567                         if (n) {
1568                                 prev = rb_entry(n, struct btrfs_free_space,
1569                                                 offset_index);
1570                                 if (!prev->bitmap &&
1571                                     prev->offset + prev->bytes > offset)
1572                                         entry = prev;
1573                         }
1574                 }
1575                 return entry;
1576         }
1577
1578         if (!prev)
1579                 return NULL;
1580
1581         /* find last entry before the 'offset' */
1582         entry = prev;
1583         if (entry->offset > offset) {
1584                 n = rb_prev(&entry->offset_index);
1585                 if (n) {
1586                         entry = rb_entry(n, struct btrfs_free_space,
1587                                         offset_index);
1588                         ASSERT(entry->offset <= offset);
1589                 } else {
1590                         if (fuzzy)
1591                                 return entry;
1592                         else
1593                                 return NULL;
1594                 }
1595         }
1596
1597         if (entry->bitmap) {
1598                 n = rb_prev(&entry->offset_index);
1599                 if (n) {
1600                         prev = rb_entry(n, struct btrfs_free_space,
1601                                         offset_index);
1602                         if (!prev->bitmap &&
1603                             prev->offset + prev->bytes > offset)
1604                                 return prev;
1605                 }
1606                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1607                         return entry;
1608         } else if (entry->offset + entry->bytes > offset)
1609                 return entry;
1610
1611         if (!fuzzy)
1612                 return NULL;
1613
1614         while (1) {
1615                 if (entry->bitmap) {
1616                         if (entry->offset + BITS_PER_BITMAP *
1617                             ctl->unit > offset)
1618                                 break;
1619                 } else {
1620                         if (entry->offset + entry->bytes > offset)
1621                                 break;
1622                 }
1623
1624                 n = rb_next(&entry->offset_index);
1625                 if (!n)
1626                         return NULL;
1627                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1628         }
1629         return entry;
1630 }
1631
1632 static inline void
1633 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1634                     struct btrfs_free_space *info)
1635 {
1636         rb_erase(&info->offset_index, &ctl->free_space_offset);
1637         ctl->free_extents--;
1638
1639         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1640                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
1641                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1642         }
1643 }
1644
1645 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1646                               struct btrfs_free_space *info)
1647 {
1648         __unlink_free_space(ctl, info);
1649         ctl->free_space -= info->bytes;
1650 }
1651
1652 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1653                            struct btrfs_free_space *info)
1654 {
1655         int ret = 0;
1656
1657         ASSERT(info->bytes || info->bitmap);
1658         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1659                                  &info->offset_index, (info->bitmap != NULL));
1660         if (ret)
1661                 return ret;
1662
1663         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1664                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
1665                 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1666         }
1667
1668         ctl->free_space += info->bytes;
1669         ctl->free_extents++;
1670         return ret;
1671 }
1672
1673 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1674                                        struct btrfs_free_space *info,
1675                                        u64 offset, u64 bytes)
1676 {
1677         unsigned long start, count, end;
1678         int extent_delta = -1;
1679
1680         start = offset_to_bit(info->offset, ctl->unit, offset);
1681         count = bytes_to_bits(bytes, ctl->unit);
1682         end = start + count;
1683         ASSERT(end <= BITS_PER_BITMAP);
1684
1685         bitmap_clear(info->bitmap, start, count);
1686
1687         info->bytes -= bytes;
1688         if (info->max_extent_size > ctl->unit)
1689                 info->max_extent_size = 0;
1690
1691         if (start && test_bit(start - 1, info->bitmap))
1692                 extent_delta++;
1693
1694         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1695                 extent_delta++;
1696
1697         info->bitmap_extents += extent_delta;
1698         if (!btrfs_free_space_trimmed(info)) {
1699                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1700                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1701         }
1702 }
1703
1704 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1705                               struct btrfs_free_space *info, u64 offset,
1706                               u64 bytes)
1707 {
1708         __bitmap_clear_bits(ctl, info, offset, bytes);
1709         ctl->free_space -= bytes;
1710 }
1711
1712 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1713                             struct btrfs_free_space *info, u64 offset,
1714                             u64 bytes)
1715 {
1716         unsigned long start, count, end;
1717         int extent_delta = 1;
1718
1719         start = offset_to_bit(info->offset, ctl->unit, offset);
1720         count = bytes_to_bits(bytes, ctl->unit);
1721         end = start + count;
1722         ASSERT(end <= BITS_PER_BITMAP);
1723
1724         bitmap_set(info->bitmap, start, count);
1725
1726         info->bytes += bytes;
1727         ctl->free_space += bytes;
1728
1729         if (start && test_bit(start - 1, info->bitmap))
1730                 extent_delta--;
1731
1732         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1733                 extent_delta--;
1734
1735         info->bitmap_extents += extent_delta;
1736         if (!btrfs_free_space_trimmed(info)) {
1737                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1738                 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1739         }
1740 }
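/*
 * Extent accounting note: bitmap_set_bits() starts extent_delta at 1 (a new
 * run of bits) and subtracts one for each neighbouring bit that is already
 * set, so filling the gap between two existing runs gives a net delta of -1
 * (two runs become one).  __bitmap_clear_bits() is the mirror image: clearing
 * bits from the middle of a run splits it and gives a net delta of +1.
 */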
1741
1742 /*
1743  * If we cannot find a suitable extent, we use *bytes to record the size of
1744  * the largest contiguous free extent found.
1745  */
1746 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1747                          struct btrfs_free_space *bitmap_info, u64 *offset,
1748                          u64 *bytes, bool for_alloc)
1749 {
1750         unsigned long found_bits = 0;
1751         unsigned long max_bits = 0;
1752         unsigned long bits, i;
1753         unsigned long next_zero;
1754         unsigned long extent_bits;
1755
1756         /*
1757          * Skip searching the bitmap if we don't have a contiguous section that
1758          * is large enough for this allocation.
1759          */
1760         if (for_alloc &&
1761             bitmap_info->max_extent_size &&
1762             bitmap_info->max_extent_size < *bytes) {
1763                 *bytes = bitmap_info->max_extent_size;
1764                 return -1;
1765         }
1766
1767         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1768                           max_t(u64, *offset, bitmap_info->offset));
1769         bits = bytes_to_bits(*bytes, ctl->unit);
1770
1771         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1772                 if (for_alloc && bits == 1) {
1773                         found_bits = 1;
1774                         break;
1775                 }
1776                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1777                                                BITS_PER_BITMAP, i);
1778                 extent_bits = next_zero - i;
1779                 if (extent_bits >= bits) {
1780                         found_bits = extent_bits;
1781                         break;
1782                 } else if (extent_bits > max_bits) {
1783                         max_bits = extent_bits;
1784                 }
1785                 i = next_zero;
1786         }
1787
1788         if (found_bits) {
1789                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1790                 *bytes = (u64)(found_bits) * ctl->unit;
1791                 return 0;
1792         }
1793
1794         *bytes = (u64)(max_bits) * ctl->unit;
1795         bitmap_info->max_extent_size = *bytes;
1796         return -1;
1797 }
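/*
 * search_bitmap() returns 0 and updates *offset/*bytes when it finds a run of
 * at least *bytes free bytes at or after *offset; otherwise it returns -1 and
 * sets *bytes to the largest contiguous run it saw, caching that value in
 * bitmap_info->max_extent_size so future allocations can bail out early.
 */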
1798
1799 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1800 {
1801         if (entry->bitmap)
1802                 return entry->max_extent_size;
1803         return entry->bytes;
1804 }
1805
1806 /* Cache the size of the max extent in bytes */
1807 static struct btrfs_free_space *
1808 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1809                 unsigned long align, u64 *max_extent_size)
1810 {
1811         struct btrfs_free_space *entry;
1812         struct rb_node *node;
1813         u64 tmp;
1814         u64 align_off;
1815         int ret;
1816
1817         if (!ctl->free_space_offset.rb_node)
1818                 goto out;
1819
1820         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1821         if (!entry)
1822                 goto out;
1823
1824         for (node = &entry->offset_index; node; node = rb_next(node)) {
1825                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1826                 if (entry->bytes < *bytes) {
1827                         *max_extent_size = max(get_max_extent_size(entry),
1828                                                *max_extent_size);
1829                         continue;
1830                 }
1831
1832                 /* make sure the space returned is big enough
1833                  * to match our requested alignment
1834                  */
1835                 if (*bytes >= align) {
1836                         tmp = entry->offset - ctl->start + align - 1;
1837                         tmp = div64_u64(tmp, align);
1838                         tmp = tmp * align + ctl->start;
1839                         align_off = tmp - entry->offset;
1840                 } else {
1841                         align_off = 0;
1842                         tmp = entry->offset;
1843                 }
1844
1845                 if (entry->bytes < *bytes + align_off) {
1846                         *max_extent_size = max(get_max_extent_size(entry),
1847                                                *max_extent_size);
1848                         continue;
1849                 }
1850
1851                 if (entry->bitmap) {
1852                         u64 size = *bytes;
1853
1854                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1855                         if (!ret) {
1856                                 *offset = tmp;
1857                                 *bytes = size;
1858                                 return entry;
1859                         } else {
1860                                 *max_extent_size =
1861                                         max(get_max_extent_size(entry),
1862                                             *max_extent_size);
1863                         }
1864                         continue;
1865                 }
1866
1867                 *offset = tmp;
1868                 *bytes = entry->bytes - align_off;
1869                 return entry;
1870         }
1871 out:
1872         return NULL;
1873 }
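/*
 * Alignment example (illustrative numbers): with ctl->start at 1GiB, an
 * extent entry at 1GiB + 100KiB and a requested alignment of 64KiB, the loop
 * above rounds the candidate offset up to 1GiB + 128KiB and records the
 * skipped 28KiB in align_off, so the entry only qualifies if it still holds
 * *bytes + align_off bytes.
 */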
1874
1875 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1876                            struct btrfs_free_space *info, u64 offset)
1877 {
1878         info->offset = offset_to_bitmap(ctl, offset);
1879         info->bytes = 0;
1880         info->bitmap_extents = 0;
1881         INIT_LIST_HEAD(&info->list);
1882         link_free_space(ctl, info);
1883         ctl->total_bitmaps++;
1884         recalculate_thresholds(ctl);
1885 }
1886
1887 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1888                         struct btrfs_free_space *bitmap_info)
1889 {
1890         /*
1891          * Normally when this is called, the bitmap is completely empty. However,
1892          * if we are blowing up the free space cache for one reason or another
1893          * via __btrfs_remove_free_space_cache(), then the bitmap may not be
1894          * empty and we may leave stale discard stats behind.
1895          */
1896         if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
1897                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
1898                         bitmap_info->bitmap_extents;
1899                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
1900
1901         }
1902         unlink_free_space(ctl, bitmap_info);
1903         kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1904         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1905         ctl->total_bitmaps--;
1906         recalculate_thresholds(ctl);
1907 }
1908
1909 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1910                               struct btrfs_free_space *bitmap_info,
1911                               u64 *offset, u64 *bytes)
1912 {
1913         u64 end;
1914         u64 search_start, search_bytes;
1915         int ret;
1916
1917 again:
1918         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1919
1920         /*
1921          * We need to search for bits in this bitmap.  We could only cover some
1922          * of the extent in this bitmap thanks to how we add space, so we need
1923          * to search for as much of it as we can and clear that amount, and then
1924          * go searching for the next bit.
1925          */
1926         search_start = *offset;
1927         search_bytes = ctl->unit;
1928         search_bytes = min(search_bytes, end - search_start + 1);
1929         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1930                             false);
1931         if (ret < 0 || search_start != *offset)
1932                 return -EINVAL;
1933
1934         /* We may have found more bits than what we need */
1935         search_bytes = min(search_bytes, *bytes);
1936
1937         /* Cannot clear past the end of the bitmap */
1938         search_bytes = min(search_bytes, end - search_start + 1);
1939
1940         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1941         *offset += search_bytes;
1942         *bytes -= search_bytes;
1943
1944         if (*bytes) {
1945                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1946                 if (!bitmap_info->bytes)
1947                         free_bitmap(ctl, bitmap_info);
1948
1949                 /*
1950                  * no entry after this bitmap, but we still have bytes to
1951                  * remove, so something has gone wrong.
1952                  */
1953                 if (!next)
1954                         return -EINVAL;
1955
1956                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1957                                        offset_index);
1958
1959                 /*
1960                  * if the next entry isn't a bitmap we need to return to let the
1961                  * extent stuff do its work.
1962                  */
1963                 if (!bitmap_info->bitmap)
1964                         return -EAGAIN;
1965
1966                 /*
1967                  * Ok the next item is a bitmap, but it may not actually hold
1968                  * the information for the rest of this free space stuff, so
1969                  * look for it, and if we don't find it return so we can try
1970                  * everything over again.
1971                  */
1972                 search_start = *offset;
1973                 search_bytes = ctl->unit;
1974                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1975                                     &search_bytes, false);
1976                 if (ret < 0 || search_start != *offset)
1977                         return -EAGAIN;
1978
1979                 goto again;
1980         } else if (!bitmap_info->bytes)
1981                 free_bitmap(ctl, bitmap_info);
1982
1983         return 0;
1984 }
1985
1986 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1987                                struct btrfs_free_space *info, u64 offset,
1988                                u64 bytes, enum btrfs_trim_state trim_state)
1989 {
1990         u64 bytes_to_set = 0;
1991         u64 end;
1992
1993         /*
1994          * This is a tradeoff to make bitmap trim state minimal.  We mark the
1995          * whole bitmap untrimmed if at any point we add untrimmed regions.
1996          */
1997         if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
1998                 if (btrfs_free_space_trimmed(info)) {
1999                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
2000                                 info->bitmap_extents;
2001                         ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2002                 }
2003                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2004         }
2005
2006         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2007
2008         bytes_to_set = min(end - offset, bytes);
2009
2010         bitmap_set_bits(ctl, info, offset, bytes_to_set);
2011
2012         /*
2013          * We set some bytes, we have no idea what the max extent size is
2014          * anymore.
2015          */
2016         info->max_extent_size = 0;
2017
2018         return bytes_to_set;
2019
2020 }
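/*
 * add_bytes_to_bitmap() only sets bits up to the end of this one bitmap and
 * returns how many bytes it actually covered, so callers such as
 * insert_into_bitmap() loop, advancing the offset until the whole range has
 * been spilled into one or more bitmaps.
 */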
2021
2022 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2023                       struct btrfs_free_space *info)
2024 {
2025         struct btrfs_block_group *block_group = ctl->private;
2026         struct btrfs_fs_info *fs_info = block_group->fs_info;
2027         bool forced = false;
2028
2029 #ifdef CONFIG_BTRFS_DEBUG
2030         if (btrfs_should_fragment_free_space(block_group))
2031                 forced = true;
2032 #endif
2033
2034         /* This is a way to reclaim large regions from the bitmaps. */
2035         if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2036                 return false;
2037
2038         /*
2039          * If we are below the extents threshold then we can add this as an
2040          * extent, and don't have to deal with the bitmap
2041          */
2042         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2043                 /*
2044                  * If this block group has some small extents we don't want to
2045                  * use up all of our free slots in the cache with them; we want
2046                  * to reserve them for larger extents.  However, if we have plenty
2047                  * of cache left then go ahead and add them, no sense in adding
2048                  * the overhead of a bitmap if we don't have to.
2049                  */
2050                 if (info->bytes <= fs_info->sectorsize * 8) {
2051                         if (ctl->free_extents * 3 <= ctl->extents_thresh)
2052                                 return false;
2053                 } else {
2054                         return false;
2055                 }
2056         }
2057
2058         /*
2059          * The original block groups from mkfs can be really small, like 8
2060          * megabytes, so don't bother with a bitmap for those entries.  However
2061          * some block groups can be smaller than what a bitmap would cover but
2062          * are still large enough that they could overflow the 32k memory limit,
2063          * so allow those block groups to still have a bitmap
2064          * entry.
2065          */
2066         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2067                 return false;
2068
2069         return true;
2070 }
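/*
 * Illustrative thresholds (assuming 4K pages and a 4K sectorsize): unless
 * fragmentation is being forced for debugging, extents of
 * FORCE_EXTENT_THRESHOLD (1MiB) or more always stay as extent entries,
 * extents of sectorsize * 8 == 32KiB or less prefer an extent entry while
 * fewer than a third of the extent budget (extents_thresh) is in use, and
 * block groups smaller than half a bitmap's coverage (64MiB here) never get
 * bitmap entries at all.
 */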
2071
2072 static const struct btrfs_free_space_op free_space_op = {
2073         .use_bitmap             = use_bitmap,
2074 };
2075
2076 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2077                               struct btrfs_free_space *info)
2078 {
2079         struct btrfs_free_space *bitmap_info;
2080         struct btrfs_block_group *block_group = NULL;
2081         int added = 0;
2082         u64 bytes, offset, bytes_added;
2083         enum btrfs_trim_state trim_state;
2084         int ret;
2085
2086         bytes = info->bytes;
2087         offset = info->offset;
2088         trim_state = info->trim_state;
2089
2090         if (!ctl->op->use_bitmap(ctl, info))
2091                 return 0;
2092
2093         if (ctl->op == &free_space_op)
2094                 block_group = ctl->private;
2095 again:
2096         /*
2097          * Since we link bitmaps right into the cluster we need to see if we
2098          * have a cluster here, and if so and it has our bitmap we need to add
2099          * the free space to that bitmap.
2100          */
2101         if (block_group && !list_empty(&block_group->cluster_list)) {
2102                 struct btrfs_free_cluster *cluster;
2103                 struct rb_node *node;
2104                 struct btrfs_free_space *entry;
2105
2106                 cluster = list_entry(block_group->cluster_list.next,
2107                                      struct btrfs_free_cluster,
2108                                      block_group_list);
2109                 spin_lock(&cluster->lock);
2110                 node = rb_first(&cluster->root);
2111                 if (!node) {
2112                         spin_unlock(&cluster->lock);
2113                         goto no_cluster_bitmap;
2114                 }
2115
2116                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2117                 if (!entry->bitmap) {
2118                         spin_unlock(&cluster->lock);
2119                         goto no_cluster_bitmap;
2120                 }
2121
2122                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2123                         bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2124                                                           bytes, trim_state);
2125                         bytes -= bytes_added;
2126                         offset += bytes_added;
2127                 }
2128                 spin_unlock(&cluster->lock);
2129                 if (!bytes) {
2130                         ret = 1;
2131                         goto out;
2132                 }
2133         }
2134
2135 no_cluster_bitmap:
2136         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2137                                          1, 0);
2138         if (!bitmap_info) {
2139                 ASSERT(added == 0);
2140                 goto new_bitmap;
2141         }
2142
2143         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2144                                           trim_state);
2145         bytes -= bytes_added;
2146         offset += bytes_added;
2147         added = 0;
2148
2149         if (!bytes) {
2150                 ret = 1;
2151                 goto out;
2152         } else
2153                 goto again;
2154
2155 new_bitmap:
2156         if (info && info->bitmap) {
2157                 add_new_bitmap(ctl, info, offset);
2158                 added = 1;
2159                 info = NULL;
2160                 goto again;
2161         } else {
2162                 spin_unlock(&ctl->tree_lock);
2163
2164                 /* no pre-allocated info, allocate a new one */
2165                 if (!info) {
2166                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2167                                                  GFP_NOFS);
2168                         if (!info) {
2169                                 spin_lock(&ctl->tree_lock);
2170                                 ret = -ENOMEM;
2171                                 goto out;
2172                         }
2173                 }
2174
2175                 /* allocate the bitmap */
2176                 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2177                                                  GFP_NOFS);
2178                 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2179                 spin_lock(&ctl->tree_lock);
2180                 if (!info->bitmap) {
2181                         ret = -ENOMEM;
2182                         goto out;
2183                 }
2184                 goto again;
2185         }
2186
2187 out:
2188         if (info) {
2189                 if (info->bitmap)
2190                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2191                                         info->bitmap);
2192                 kmem_cache_free(btrfs_free_space_cachep, info);
2193         }
2194
2195         return ret;
2196 }
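/*
 * insert_into_bitmap() returns 1 when the whole range was absorbed into
 * bitmap entries (any pre-allocated @info is freed here), 0 when bitmaps
 * should not be used and the caller links @info as a regular extent entry,
 * and a negative errno on allocation failure.
 */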
2197
2198 /*
2199  * Free space merging rules:
2200  *  1) Merge trimmed areas together
2201  *  2) Let untrimmed areas coalesce with trimmed areas
2202  *  3) Always pull neighboring regions from bitmaps
2203  *
2204  * The above rules are for when we merge free space based on btrfs_trim_state.
2205  * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2206  * same reason: to promote larger extent regions which makes life easier for
2207  * find_free_extent().  Rule 2 enables coalescing based on the common path
2208  * of returning free space from btrfs_finish_extent_commit().  So when free
2209  * space is trimmed, it will prevent aggregating newly trimmed regions with
2210  * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2211  * and provide find_free_extent() with the largest extents possible, hoping for
2212  * the reuse path.
2213  */
2214 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2215                           struct btrfs_free_space *info, bool update_stat)
2216 {
2217         struct btrfs_free_space *left_info = NULL;
2218         struct btrfs_free_space *right_info;
2219         bool merged = false;
2220         u64 offset = info->offset;
2221         u64 bytes = info->bytes;
2222         const bool is_trimmed = btrfs_free_space_trimmed(info);
2223
2224         /*
2225          * First we want to see if there is free space adjacent to the range we
2226          * are adding.  If there is, remove that struct and add a new one to
2227          * cover the entire range.
2228          */
2229         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2230         if (right_info && rb_prev(&right_info->offset_index))
2231                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2232                                      struct btrfs_free_space, offset_index);
2233         else if (!right_info)
2234                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2235
2236         /* See try_merge_free_space() comment. */
2237         if (right_info && !right_info->bitmap &&
2238             (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2239                 if (update_stat)
2240                         unlink_free_space(ctl, right_info);
2241                 else
2242                         __unlink_free_space(ctl, right_info);
2243                 info->bytes += right_info->bytes;
2244                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2245                 merged = true;
2246         }
2247
2248         /* See try_merge_free_space() comment. */
2249         if (left_info && !left_info->bitmap &&
2250             left_info->offset + left_info->bytes == offset &&
2251             (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2252                 if (update_stat)
2253                         unlink_free_space(ctl, left_info);
2254                 else
2255                         __unlink_free_space(ctl, left_info);
2256                 info->offset = left_info->offset;
2257                 info->bytes += left_info->bytes;
2258                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2259                 merged = true;
2260         }
2261
2262         return merged;
2263 }
2264
2265 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2266                                      struct btrfs_free_space *info,
2267                                      bool update_stat)
2268 {
2269         struct btrfs_free_space *bitmap;
2270         unsigned long i;
2271         unsigned long j;
2272         const u64 end = info->offset + info->bytes;
2273         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2274         u64 bytes;
2275
2276         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2277         if (!bitmap)
2278                 return false;
2279
2280         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2281         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2282         if (j == i)
2283                 return false;
2284         bytes = (j - i) * ctl->unit;
2285         info->bytes += bytes;
2286
2287         /* See try_merge_free_space() comment. */
2288         if (!btrfs_free_space_trimmed(bitmap))
2289                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2290
2291         if (update_stat)
2292                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2293         else
2294                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2295
2296         if (!bitmap->bytes)
2297                 free_bitmap(ctl, bitmap);
2298
2299         return true;
2300 }
2301
2302 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2303                                        struct btrfs_free_space *info,
2304                                        bool update_stat)
2305 {
2306         struct btrfs_free_space *bitmap;
2307         u64 bitmap_offset;
2308         unsigned long i;
2309         unsigned long j;
2310         unsigned long prev_j;
2311         u64 bytes;
2312
2313         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2314         /* If we're on a boundary, try the previous logical bitmap. */
2315         if (bitmap_offset == info->offset) {
2316                 if (info->offset == 0)
2317                         return false;
2318                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2319         }
2320
2321         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2322         if (!bitmap)
2323                 return false;
2324
2325         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2326         j = 0;
2327         prev_j = (unsigned long)-1;
2328         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2329                 if (j > i)
2330                         break;
2331                 prev_j = j;
2332         }
2333         if (prev_j == i)
2334                 return false;
2335
2336         if (prev_j == (unsigned long)-1)
2337                 bytes = (i + 1) * ctl->unit;
2338         else
2339                 bytes = (i - prev_j) * ctl->unit;
2340
2341         info->offset -= bytes;
2342         info->bytes += bytes;
2343
2344         /* See try_merge_free_space() comment. */
2345         if (!btrfs_free_space_trimmed(bitmap))
2346                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2347
2348         if (update_stat)
2349                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2350         else
2351                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2352
2353         if (!bitmap->bytes)
2354                 free_bitmap(ctl, bitmap);
2355
2356         return true;
2357 }
2358
2359 /*
2360  * We prefer always to allocate from extent entries, both for clustered and
2361  * non-clustered allocation requests. So when attempting to add a new extent
2362  * entry, try to see if there's adjacent free space in bitmap entries, and if
2363  * there is, migrate that space from the bitmaps to the extent.
2364  * Like this we get better chances of satisfying space allocation requests
2365  * because we attempt to satisfy them based on a single cache entry, and never
2366  * on 2 or more entries - even if the entries represent a contiguous free space
2367  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2368  * ends).
2369  */
2370 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2371                               struct btrfs_free_space *info,
2372                               bool update_stat)
2373 {
2374         /*
2375          * Only work with disconnected entries, as we can change their offset,
2376          * and must be extent entries.
2377          */
2378         ASSERT(!info->bitmap);
2379         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2380
2381         if (ctl->total_bitmaps > 0) {
2382                 bool stole_end;
2383                 bool stole_front = false;
2384
2385                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2386                 if (ctl->total_bitmaps > 0)
2387                         stole_front = steal_from_bitmap_to_front(ctl, info,
2388                                                                  update_stat);
2389
2390                 if (stole_end || stole_front)
2391                         try_merge_free_space(ctl, info, update_stat);
2392         }
2393 }
2394
2395 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2396                            struct btrfs_free_space_ctl *ctl,
2397                            u64 offset, u64 bytes,
2398                            enum btrfs_trim_state trim_state)
2399 {
2400         struct btrfs_block_group *block_group = ctl->private;
2401         struct btrfs_free_space *info;
2402         int ret = 0;
2403         u64 filter_bytes = bytes;
2404
2405         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2406         if (!info)
2407                 return -ENOMEM;
2408
2409         info->offset = offset;
2410         info->bytes = bytes;
2411         info->trim_state = trim_state;
2412         RB_CLEAR_NODE(&info->offset_index);
2413
2414         spin_lock(&ctl->tree_lock);
2415
2416         if (try_merge_free_space(ctl, info, true))
2417                 goto link;
2418
2419         /*
2420          * There was no extent directly to the left or right of this new
2421          * extent, so we know we're going to have to allocate a new extent;
2422          * before we do that, see if we need to drop this into a bitmap.
2423          */
2424         ret = insert_into_bitmap(ctl, info);
2425         if (ret < 0) {
2426                 goto out;
2427         } else if (ret) {
2428                 ret = 0;
2429                 goto out;
2430         }
2431 link:
2432         /*
2433          * Only steal free space from adjacent bitmaps if we're sure we're not
2434          * going to add the new free space to existing bitmap entries - because
2435          * that would mean unnecessary work that would be reverted. Therefore
2436          * attempt to steal space from bitmaps if we're adding an extent entry.
2437          */
2438         steal_from_bitmap(ctl, info, true);
2439
2440         filter_bytes = max(filter_bytes, info->bytes);
2441
2442         ret = link_free_space(ctl, info);
2443         if (ret)
2444                 kmem_cache_free(btrfs_free_space_cachep, info);
2445 out:
2446         btrfs_discard_update_discardable(block_group);
2447         spin_unlock(&ctl->tree_lock);
2448
2449         if (ret) {
2450                 btrfs_crit(fs_info, "unable to add free space: %d", ret);
2451                 ASSERT(ret != -EEXIST);
2452         }
2453
2454         if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2455                 btrfs_discard_check_filter(block_group, filter_bytes);
2456                 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2457         }
2458
2459         return ret;
2460 }
2461
2462 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2463                          u64 bytenr, u64 size)
2464 {
2465         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2466
2467         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2468                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2469
2470         return __btrfs_add_free_space(block_group->fs_info,
2471                                       block_group->free_space_ctl,
2472                                       bytenr, size, trim_state);
2473 }
2474
2475 /*
2476  * This is a subtle distinction because when adding free space back in
2477  * general, we want it to be added as untrimmed so async discard can process
2478  * it.  But when we add it while loading a block group, we consider it trimmed.
2479  */
2480 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2481                                        u64 bytenr, u64 size)
2482 {
2483         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2484
2485         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2486             btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2487                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2488
2489         return __btrfs_add_free_space(block_group->fs_info,
2490                                       block_group->free_space_ctl,
2491                                       bytenr, size, trim_state);
2492 }
2493
2494 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2495                             u64 offset, u64 bytes)
2496 {
2497         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2498         struct btrfs_free_space *info;
2499         int ret;
2500         bool re_search = false;
2501
2502         spin_lock(&ctl->tree_lock);
2503
2504 again:
2505         ret = 0;
2506         if (!bytes)
2507                 goto out_lock;
2508
2509         info = tree_search_offset(ctl, offset, 0, 0);
2510         if (!info) {
2511                 /*
2512                  * oops didn't find an extent that matched the space we wanted
2513                  * to remove, look for a bitmap instead
2514                  */
2515                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2516                                           1, 0);
2517                 if (!info) {
2518                         /*
2519                          * If we found a partial bit of our free space in a
2520                          * bitmap but then couldn't find the other part this may
2521                          * be a problem, so WARN about it.
2522                          */
2523                         WARN_ON(re_search);
2524                         goto out_lock;
2525                 }
2526         }
2527
2528         re_search = false;
2529         if (!info->bitmap) {
2530                 unlink_free_space(ctl, info);
2531                 if (offset == info->offset) {
2532                         u64 to_free = min(bytes, info->bytes);
2533
2534                         info->bytes -= to_free;
2535                         info->offset += to_free;
2536                         if (info->bytes) {
2537                                 ret = link_free_space(ctl, info);
2538                                 WARN_ON(ret);
2539                         } else {
2540                                 kmem_cache_free(btrfs_free_space_cachep, info);
2541                         }
2542
2543                         offset += to_free;
2544                         bytes -= to_free;
2545                         goto again;
2546                 } else {
2547                         u64 old_end = info->bytes + info->offset;
2548
2549                         info->bytes = offset - info->offset;
2550                         ret = link_free_space(ctl, info);
2551                         WARN_ON(ret);
2552                         if (ret)
2553                                 goto out_lock;
2554
2555                         /* Not enough bytes in this entry to satisfy us */
2556                         if (old_end < offset + bytes) {
2557                                 bytes -= old_end - offset;
2558                                 offset = old_end;
2559                                 goto again;
2560                         } else if (old_end == offset + bytes) {
2561                                 /* all done */
2562                                 goto out_lock;
2563                         }
2564                         spin_unlock(&ctl->tree_lock);
2565
2566                         ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2567                                                      offset + bytes,
2568                                                      old_end - (offset + bytes),
2569                                                      info->trim_state);
2570                         WARN_ON(ret);
2571                         goto out;
2572                 }
2573         }
2574
2575         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2576         if (ret == -EAGAIN) {
2577                 re_search = true;
2578                 goto again;
2579         }
2580 out_lock:
2581         btrfs_discard_update_discardable(block_group);
2582         spin_unlock(&ctl->tree_lock);
2583 out:
2584         return ret;
2585 }
2586
2587 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2588                            u64 bytes)
2589 {
2590         struct btrfs_fs_info *fs_info = block_group->fs_info;
2591         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2592         struct btrfs_free_space *info;
2593         struct rb_node *n;
2594         int count = 0;
2595
2596         spin_lock(&ctl->tree_lock);
2597         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2598                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2599                 if (info->bytes >= bytes && !block_group->ro)
2600                         count++;
2601                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2602                            info->offset, info->bytes,
2603                            (info->bitmap) ? "yes" : "no");
2604         }
2605         spin_unlock(&ctl->tree_lock);
2606         btrfs_info(fs_info, "block group has cluster?: %s",
2607                list_empty(&block_group->cluster_list) ? "no" : "yes");
2608         btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes",
2609                    count, bytes);
2610 }
2611
2612 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2613                                struct btrfs_free_space_ctl *ctl)
2614 {
2615         struct btrfs_fs_info *fs_info = block_group->fs_info;
2616
2617         spin_lock_init(&ctl->tree_lock);
2618         ctl->unit = fs_info->sectorsize;
2619         ctl->start = block_group->start;
2620         ctl->private = block_group;
2621         ctl->op = &free_space_op;
2622         INIT_LIST_HEAD(&ctl->trimming_ranges);
2623         mutex_init(&ctl->cache_writeout_mutex);
2624
2625         /*
2626          * we only want to have 32k of ram per block group for keeping
2627          * track of free space, and if we pass 1/2 of that we want to
2628          * start converting things over to using bitmaps
2629          */
2630         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2631 }
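/*
 * Rough numbers: extents_thresh is half the 32K budget divided by
 * sizeof(struct btrfs_free_space); assuming a free space entry of roughly
 * 70-80 bytes on 64-bit, that allows on the order of ~200 extent entries per
 * block group before use_bitmap() starts steering new space into bitmaps.
 */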
2632
2633 /*
2634  * for a given cluster, put all of its extents back into the free
2635  * space cache.  If the block group passed doesn't match the block group
2636  * pointed to by the cluster, someone else raced in and freed the
2637  * cluster already.  In that case, we just return without changing anything
2638  */
2639 static void __btrfs_return_cluster_to_free_space(
2640                              struct btrfs_block_group *block_group,
2641                              struct btrfs_free_cluster *cluster)
2642 {
2643         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2644         struct btrfs_free_space *entry;
2645         struct rb_node *node;
2646
2647         spin_lock(&cluster->lock);
2648         if (cluster->block_group != block_group)
2649                 goto out;
2650
2651         cluster->block_group = NULL;
2652         cluster->window_start = 0;
2653         list_del_init(&cluster->block_group_list);
2654
2655         node = rb_first(&cluster->root);
2656         while (node) {
2657                 bool bitmap;
2658
2659                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2660                 node = rb_next(&entry->offset_index);
2661                 rb_erase(&entry->offset_index, &cluster->root);
2662                 RB_CLEAR_NODE(&entry->offset_index);
2663
2664                 bitmap = (entry->bitmap != NULL);
2665                 if (!bitmap) {
2666                         /* Merging treats extents as if they were new */
2667                         if (!btrfs_free_space_trimmed(entry)) {
2668                                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
2669                                 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2670                                         entry->bytes;
2671                         }
2672
2673                         try_merge_free_space(ctl, entry, false);
2674                         steal_from_bitmap(ctl, entry, false);
2675
2676                         /* As we insert directly, update these statistics */
2677                         if (!btrfs_free_space_trimmed(entry)) {
2678                                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
2679                                 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2680                                         entry->bytes;
2681                         }
2682                 }
2683                 tree_insert_offset(&ctl->free_space_offset,
2684                                    entry->offset, &entry->offset_index, bitmap);
2685         }
2686         cluster->root = RB_ROOT;
2687
2688 out:
2689         spin_unlock(&cluster->lock);
2690         btrfs_put_block_group(block_group);
2691 }
2692
2693 static void __btrfs_remove_free_space_cache_locked(
2694                                 struct btrfs_free_space_ctl *ctl)
2695 {
2696         struct btrfs_free_space *info;
2697         struct rb_node *node;
2698
2699         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2700                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2701                 if (!info->bitmap) {
2702                         unlink_free_space(ctl, info);
2703                         kmem_cache_free(btrfs_free_space_cachep, info);
2704                 } else {
2705                         free_bitmap(ctl, info);
2706                 }
2707
2708                 cond_resched_lock(&ctl->tree_lock);
2709         }
2710 }
2711
2712 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2713 {
2714         spin_lock(&ctl->tree_lock);
2715         __btrfs_remove_free_space_cache_locked(ctl);
2716         if (ctl->private)
2717                 btrfs_discard_update_discardable(ctl->private);
2718         spin_unlock(&ctl->tree_lock);
2719 }
2720
2721 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2722 {
2723         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2724         struct btrfs_free_cluster *cluster;
2725         struct list_head *head;
2726
2727         spin_lock(&ctl->tree_lock);
2728         while ((head = block_group->cluster_list.next) !=
2729                &block_group->cluster_list) {
2730                 cluster = list_entry(head, struct btrfs_free_cluster,
2731                                      block_group_list);
2732
2733                 WARN_ON(cluster->block_group != block_group);
2734                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2735
2736                 cond_resched_lock(&ctl->tree_lock);
2737         }
2738         __btrfs_remove_free_space_cache_locked(ctl);
2739         btrfs_discard_update_discardable(block_group);
2740         spin_unlock(&ctl->tree_lock);
2741
2742 }
2743
2744 /**
2745  * btrfs_is_free_space_trimmed - see if everything is trimmed
2746  * @block_group: block_group of interest
2747  *
2748  * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2749  */
2750 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2751 {
2752         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2753         struct btrfs_free_space *info;
2754         struct rb_node *node;
2755         bool ret = true;
2756
2757         spin_lock(&ctl->tree_lock);
2758         node = rb_first(&ctl->free_space_offset);
2759
2760         while (node) {
2761                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2762
2763                 if (!btrfs_free_space_trimmed(info)) {
2764                         ret = false;
2765                         break;
2766                 }
2767
2768                 node = rb_next(node);
2769         }
2770
2771         spin_unlock(&ctl->tree_lock);
2772         return ret;
2773 }
2774
2775 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2776                                u64 offset, u64 bytes, u64 empty_size,
2777                                u64 *max_extent_size)
2778 {
2779         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2780         struct btrfs_discard_ctl *discard_ctl =
2781                                         &block_group->fs_info->discard_ctl;
2782         struct btrfs_free_space *entry = NULL;
2783         u64 bytes_search = bytes + empty_size;
2784         u64 ret = 0;
2785         u64 align_gap = 0;
2786         u64 align_gap_len = 0;
2787         enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2788
2789         spin_lock(&ctl->tree_lock);
2790         entry = find_free_space(ctl, &offset, &bytes_search,
2791                                 block_group->full_stripe_len, max_extent_size);
2792         if (!entry)
2793                 goto out;
2794
2795         ret = offset;
2796         if (entry->bitmap) {
2797                 bitmap_clear_bits(ctl, entry, offset, bytes);
2798
2799                 if (!btrfs_free_space_trimmed(entry))
2800                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2801
2802                 if (!entry->bytes)
2803                         free_bitmap(ctl, entry);
2804         } else {
2805                 unlink_free_space(ctl, entry);
2806                 align_gap_len = offset - entry->offset;
2807                 align_gap = entry->offset;
2808                 align_gap_trim_state = entry->trim_state;
2809
2810                 if (!btrfs_free_space_trimmed(entry))
2811                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2812
2813                 entry->offset = offset + bytes;
2814                 WARN_ON(entry->bytes < bytes + align_gap_len);
2815
2816                 entry->bytes -= bytes + align_gap_len;
2817                 if (!entry->bytes)
2818                         kmem_cache_free(btrfs_free_space_cachep, entry);
2819                 else
2820                         link_free_space(ctl, entry);
2821         }
2822 out:
2823         btrfs_discard_update_discardable(block_group);
2824         spin_unlock(&ctl->tree_lock);
2825
2826         if (align_gap_len)
2827                 __btrfs_add_free_space(block_group->fs_info, ctl,
2828                                        align_gap, align_gap_len,
2829                                        align_gap_trim_state);
2830         return ret;
2831 }
2832
2833 /*
2834  * given a cluster, put all of its extents back into the free space
2835  * cache.  If a block group is passed, this function will only free
2836  * a cluster that belongs to the passed block group.
2837  *
2838  * Otherwise, it'll get a reference on the block group pointed to by the
2839  * cluster and remove the cluster from it.
2840  */
2841 void btrfs_return_cluster_to_free_space(
2842                                struct btrfs_block_group *block_group,
2843                                struct btrfs_free_cluster *cluster)
2844 {
2845         struct btrfs_free_space_ctl *ctl;
2846
2847         /* first, get a safe pointer to the block group */
2848         spin_lock(&cluster->lock);
2849         if (!block_group) {
2850                 block_group = cluster->block_group;
2851                 if (!block_group) {
2852                         spin_unlock(&cluster->lock);
2853                         return;
2854                 }
2855         } else if (cluster->block_group != block_group) {
2856                 /* someone else has already freed it, don't redo their work */
2857                 spin_unlock(&cluster->lock);
2858                 return;
2859         }
2860         btrfs_get_block_group(block_group);
2861         spin_unlock(&cluster->lock);
2862
2863         ctl = block_group->free_space_ctl;
2864
2865         /* now return any extents the cluster had on it */
2866         spin_lock(&ctl->tree_lock);
2867         __btrfs_return_cluster_to_free_space(block_group, cluster);
2868         spin_unlock(&ctl->tree_lock);
2869
2870         btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
2871
2872         /* finally drop our ref */
2873         btrfs_put_block_group(block_group);
2874 }
2875
2876 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
2877                                    struct btrfs_free_cluster *cluster,
2878                                    struct btrfs_free_space *entry,
2879                                    u64 bytes, u64 min_start,
2880                                    u64 *max_extent_size)
2881 {
2882         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2883         int err;
2884         u64 search_start;
2885         u64 search_bytes;
2886         u64 ret = 0;
2887
2888         search_start = min_start;
2889         search_bytes = bytes;
2890
2891         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2892         if (err) {
2893                 *max_extent_size = max(get_max_extent_size(entry),
2894                                        *max_extent_size);
2895                 return 0;
2896         }
2897
2898         ret = search_start;
2899         __bitmap_clear_bits(ctl, entry, ret, bytes);
2900
2901         return ret;
2902 }
2903
2904 /*
2905  * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
2906  * couldn't find anything suitably large, or a logical disk offset if
2907  * things worked out.
2908  */
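/*
 * Minimal usage sketch (an assumption about the caller, not a quote of the
 * real extent allocator, which also refills the cluster under
 * cluster->refill_lock; search_start and num_bytes are placeholder names):
 *
 *	u64 max_extent_size = 0;
 *	u64 start = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
 *					     block_group->start,
 *					     &max_extent_size);
 *	if (start == 0)
 *		start = btrfs_find_space_for_alloc(block_group, search_start,
 *						   num_bytes, 0,
 *						   &max_extent_size);
 */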
2909 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
2910                              struct btrfs_free_cluster *cluster, u64 bytes,
2911                              u64 min_start, u64 *max_extent_size)
2912 {
2913         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2914         struct btrfs_discard_ctl *discard_ctl =
2915                                         &block_group->fs_info->discard_ctl;
2916         struct btrfs_free_space *entry = NULL;
2917         struct rb_node *node;
2918         u64 ret = 0;
2919
2920         spin_lock(&cluster->lock);
2921         if (bytes > cluster->max_size)
2922                 goto out;
2923
2924         if (cluster->block_group != block_group)
2925                 goto out;
2926
2927         node = rb_first(&cluster->root);
2928         if (!node)
2929                 goto out;
2930
2931         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2932         while (1) {
2933                 if (entry->bytes < bytes)
2934                         *max_extent_size = max(get_max_extent_size(entry),
2935                                                *max_extent_size);
2936
2937                 if (entry->bytes < bytes ||
2938                     (!entry->bitmap && entry->offset < min_start)) {
2939                         node = rb_next(&entry->offset_index);
2940                         if (!node)
2941                                 break;
2942                         entry = rb_entry(node, struct btrfs_free_space,
2943                                          offset_index);
2944                         continue;
2945                 }
2946
2947                 if (entry->bitmap) {
2948                         ret = btrfs_alloc_from_bitmap(block_group,
2949                                                       cluster, entry, bytes,
2950                                                       cluster->window_start,
2951                                                       max_extent_size);
2952                         if (ret == 0) {
2953                                 node = rb_next(&entry->offset_index);
2954                                 if (!node)
2955                                         break;
2956                                 entry = rb_entry(node, struct btrfs_free_space,
2957                                                  offset_index);
2958                                 continue;
2959                         }
2960                         cluster->window_start += bytes;
2961                 } else {
2962                         ret = entry->offset;
2963
2964                         entry->offset += bytes;
2965                         entry->bytes -= bytes;
2966                 }
2967
2968                 if (entry->bytes == 0)
2969                         rb_erase(&entry->offset_index, &cluster->root);
2970                 break;
2971         }
2972 out:
2973         spin_unlock(&cluster->lock);
2974
2975         if (!ret)
2976                 return 0;
2977
2978         spin_lock(&ctl->tree_lock);
2979
2980         if (!btrfs_free_space_trimmed(entry))
2981                 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2982
2983         ctl->free_space -= bytes;
2984         if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
2985                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
2986         if (entry->bytes == 0) {
2987                 ctl->free_extents--;
2988                 if (entry->bitmap) {
2989                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2990                                         entry->bitmap);
2991                         ctl->total_bitmaps--;
2992                         recalculate_thresholds(ctl);
2993                 } else if (!btrfs_free_space_trimmed(entry)) {
2994                         ctl->discardable_extents[BTRFS_STAT_CURR]--;
2995                 }
2996                 kmem_cache_free(btrfs_free_space_cachep, entry);
2997         }
2998
2999         spin_unlock(&ctl->tree_lock);
3000
3001         return ret;
3002 }
3003
3004 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3005                                 struct btrfs_free_space *entry,
3006                                 struct btrfs_free_cluster *cluster,
3007                                 u64 offset, u64 bytes,
3008                                 u64 cont1_bytes, u64 min_bytes)
3009 {
3010         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3011         unsigned long next_zero;
3012         unsigned long i;
3013         unsigned long want_bits;
3014         unsigned long min_bits;
3015         unsigned long found_bits;
3016         unsigned long max_bits = 0;
3017         unsigned long start = 0;
3018         unsigned long total_found = 0;
3019         int ret;
3020
3021         i = offset_to_bit(entry->offset, ctl->unit,
3022                           max_t(u64, offset, entry->offset));
3023         want_bits = bytes_to_bits(bytes, ctl->unit);
3024         min_bits = bytes_to_bits(min_bytes, ctl->unit);
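        /*
         * Scale check (assuming 4K pages and a 4K ctl->unit, both
         * hypothetical here): each bitmap holds BITS_PER_BITMAP = 32768 bits
         * and so covers 128M, and a cont1_bytes of 1M needs at least one run
         * of 256 consecutive set bits somewhere in this bitmap.
         */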
3025
3026         /*
3027          * Don't bother looking for a cluster in this bitmap if it's heavily
3028          * fragmented.
3029          */
3030         if (entry->max_extent_size &&
3031             entry->max_extent_size < cont1_bytes)
3032                 return -ENOSPC;
3033 again:
3034         found_bits = 0;
3035         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3036                 next_zero = find_next_zero_bit(entry->bitmap,
3037                                                BITS_PER_BITMAP, i);
3038                 if (next_zero - i >= min_bits) {
3039                         found_bits = next_zero - i;
3040                         if (found_bits > max_bits)
3041                                 max_bits = found_bits;
3042                         break;
3043                 }
3044                 if (next_zero - i > max_bits)
3045                         max_bits = next_zero - i;
3046                 i = next_zero;
3047         }
3048
3049         if (!found_bits) {
3050                 entry->max_extent_size = (u64)max_bits * ctl->unit;
3051                 return -ENOSPC;
3052         }
3053
3054         if (!total_found) {
3055                 start = i;
3056                 cluster->max_size = 0;
3057         }
3058
3059         total_found += found_bits;
3060
3061         if (cluster->max_size < found_bits * ctl->unit)
3062                 cluster->max_size = found_bits * ctl->unit;
3063
3064         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3065                 i = next_zero + 1;
3066                 goto again;
3067         }
3068
3069         cluster->window_start = start * ctl->unit + entry->offset;
3070         rb_erase(&entry->offset_index, &ctl->free_space_offset);
3071         ret = tree_insert_offset(&cluster->root, entry->offset,
3072                                  &entry->offset_index, 1);
3073         ASSERT(!ret); /* -EEXIST; Logic error */
3074
3075         trace_btrfs_setup_cluster(block_group, cluster,
3076                                   total_found * ctl->unit, 1);
3077         return 0;
3078 }
3079
3080 /*
3081  * This searches the block group for just extents to fill the cluster with.
3082  * Try to find a cluster with at least bytes total bytes, at least one
3083  * extent of cont1_bytes, and other extents of at least min_bytes.
3084  */
3085 static noinline int
3086 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3087                         struct btrfs_free_cluster *cluster,
3088                         struct list_head *bitmaps, u64 offset, u64 bytes,
3089                         u64 cont1_bytes, u64 min_bytes)
3090 {
3091         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3092         struct btrfs_free_space *first = NULL;
3093         struct btrfs_free_space *entry = NULL;
3094         struct btrfs_free_space *last;
3095         struct rb_node *node;
3096         u64 window_free;
3097         u64 max_extent;
3098         u64 total_size = 0;
3099
3100         entry = tree_search_offset(ctl, offset, 0, 1);
3101         if (!entry)
3102                 return -ENOSPC;
3103
3104         /*
3105          * We don't want bitmaps, so just move along until we find a normal
3106          * extent entry.
3107          */
3108         while (entry->bitmap || entry->bytes < min_bytes) {
3109                 if (entry->bitmap && list_empty(&entry->list))
3110                         list_add_tail(&entry->list, bitmaps);
3111                 node = rb_next(&entry->offset_index);
3112                 if (!node)
3113                         return -ENOSPC;
3114                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3115         }
3116
3117         window_free = entry->bytes;
3118         max_extent = entry->bytes;
3119         first = entry;
3120         last = entry;
3121
3122         for (node = rb_next(&entry->offset_index); node;
3123              node = rb_next(&entry->offset_index)) {
3124                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3125
3126                 if (entry->bitmap) {
3127                         if (list_empty(&entry->list))
3128                                 list_add_tail(&entry->list, bitmaps);
3129                         continue;
3130                 }
3131
3132                 if (entry->bytes < min_bytes)
3133                         continue;
3134
3135                 last = entry;
3136                 window_free += entry->bytes;
3137                 if (entry->bytes > max_extent)
3138                         max_extent = entry->bytes;
3139         }
3140
3141         if (window_free < bytes || max_extent < cont1_bytes)
3142                 return -ENOSPC;
3143
3144         cluster->window_start = first->offset;
3145
3146         node = &first->offset_index;
3147
3148         /*
3149          * now we've found our entries, pull them out of the free space
3150          * cache and put them into the cluster rbtree
3151          */
3152         do {
3153                 int ret;
3154
3155                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3156                 node = rb_next(&entry->offset_index);
3157                 if (entry->bitmap || entry->bytes < min_bytes)
3158                         continue;
3159
3160                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3161                 ret = tree_insert_offset(&cluster->root, entry->offset,
3162                                          &entry->offset_index, 0);
3163                 total_size += entry->bytes;
3164                 ASSERT(!ret); /* -EEXIST; Logic error */
3165         } while (node && entry != last);
3166
3167         cluster->max_size = max_extent;
3168         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3169         return 0;
3170 }
3171
3172 /*
3173  * This specifically looks for bitmaps that may work in the cluster, we assume
3174  * that we have already failed to find extents that will work.
3175  */
3176 static noinline int
3177 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3178                      struct btrfs_free_cluster *cluster,
3179                      struct list_head *bitmaps, u64 offset, u64 bytes,
3180                      u64 cont1_bytes, u64 min_bytes)
3181 {
3182         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3183         struct btrfs_free_space *entry = NULL;
3184         int ret = -ENOSPC;
3185         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3186
3187         if (ctl->total_bitmaps == 0)
3188                 return -ENOSPC;
3189
3190         /*
3191          * The bitmap that covers offset won't be in the list unless offset
3192          * is just its start offset.
3193          */
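        /*
         * Example (assuming 4K pages and a 4K ctl->unit, so a bitmap spans
         * 128M): an offset 2M into a bitmap's window maps to a bitmap_offset
         * 2M lower, i.e. the window start, which is why the covering bitmap
         * may still need the extra tree_search_offset() lookup below.
         */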
3194         if (!list_empty(bitmaps))
3195                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3196
3197         if (!entry || entry->offset != bitmap_offset) {
3198                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3199                 if (entry && list_empty(&entry->list))
3200                         list_add(&entry->list, bitmaps);
3201         }
3202
3203         list_for_each_entry(entry, bitmaps, list) {
3204                 if (entry->bytes < bytes)
3205                         continue;
3206                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3207                                            bytes, cont1_bytes, min_bytes);
3208                 if (!ret)
3209                         return 0;
3210         }
3211
3212         /*
3213          * The bitmaps list has all the bitmaps that record free space
3214          * starting after offset, so no more search is required.
3215          */
3216         return -ENOSPC;
3217 }
3218
3219 /*
3220  * Here we try to find a cluster of blocks in a block group.  The goal
3221  * is to find at least bytes + empty_size of free space.
3222  * We might not find them all in one contiguous area.
3223  *
3224  * Returns zero and sets up the cluster if things worked out, otherwise
3225  * it returns -ENOSPC.
3226  */
3227 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3228                              struct btrfs_free_cluster *cluster,
3229                              u64 offset, u64 bytes, u64 empty_size)
3230 {
3231         struct btrfs_fs_info *fs_info = block_group->fs_info;
3232         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3233         struct btrfs_free_space *entry, *tmp;
3234         LIST_HEAD(bitmaps);
3235         u64 min_bytes;
3236         u64 cont1_bytes;
3237         int ret;
3238
3239         /*
3240          * Choose the minimum extent size we'll require for this
3241          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3242  * For metadata, allow allocations with smaller extents.  For
3243          * data, keep it dense.
3244          */
3245         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3246                 cont1_bytes = min_bytes = bytes + empty_size;
3247         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3248                 cont1_bytes = bytes;
3249                 min_bytes = fs_info->sectorsize;
3250         } else {
3251                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3252                 min_bytes = fs_info->sectorsize;
3253         }
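        /*
         * Worked example (hypothetical numbers): a data allocation with
         * bytes = 64K and empty_size = 960K gives cont1_bytes =
         * max(64K, 1M >> 2) = 256K and min_bytes = sectorsize, i.e. the
         * cluster needs one 256K contiguous extent but may fill the rest of
         * the 1M from smaller pieces.
         */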
3254
3255         spin_lock(&ctl->tree_lock);
3256
3257         /*
3258          * If we know we don't have enough space to make a cluster don't even
3259          * bother doing all the work to try and find one.
3260          */
3261         if (ctl->free_space < bytes) {
3262                 spin_unlock(&ctl->tree_lock);
3263                 return -ENOSPC;
3264         }
3265
3266         spin_lock(&cluster->lock);
3267
3268         /* someone already found a cluster, hooray */
3269         if (cluster->block_group) {
3270                 ret = 0;
3271                 goto out;
3272         }
3273
3274         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3275                                  min_bytes);
3276
3277         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3278                                       bytes + empty_size,
3279                                       cont1_bytes, min_bytes);
3280         if (ret)
3281                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3282                                            offset, bytes + empty_size,
3283                                            cont1_bytes, min_bytes);
3284
3285         /* Clear our temporary list */
3286         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3287                 list_del_init(&entry->list);
3288
3289         if (!ret) {
3290                 btrfs_get_block_group(block_group);
3291                 list_add_tail(&cluster->block_group_list,
3292                               &block_group->cluster_list);
3293                 cluster->block_group = block_group;
3294         } else {
3295                 trace_btrfs_failed_cluster_setup(block_group);
3296         }
3297 out:
3298         spin_unlock(&cluster->lock);
3299         spin_unlock(&ctl->tree_lock);
3300
3301         return ret;
3302 }
3303
3304 /*
3305  * simple code to zero out a cluster
3306  */
3307 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3308 {
3309         spin_lock_init(&cluster->lock);
3310         spin_lock_init(&cluster->refill_lock);
3311         cluster->root = RB_ROOT;
3312         cluster->max_size = 0;
3313         cluster->fragmented = false;
3314         INIT_LIST_HEAD(&cluster->block_group_list);
3315         cluster->block_group = NULL;
3316 }
3317
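/*
 * Sketch of what do_trimming() re-adds (hypothetical numbers): for a reserved
 * range [1M, 1M + 256K) of which only [1M + 64K, 1M + 192K) was actually
 * discarded, the untouched prefix [1M, 1M + 64K) and suffix
 * [1M + 192K, 1M + 256K) go back with their previous trim state, while the
 * 128K in the middle goes back as BTRFS_TRIM_STATE_TRIMMED if the discard
 * succeeded (and as BTRFS_TRIM_STATE_UNTRIMMED otherwise).
 */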
3318 static int do_trimming(struct btrfs_block_group *block_group,
3319                        u64 *total_trimmed, u64 start, u64 bytes,
3320                        u64 reserved_start, u64 reserved_bytes,
3321                        enum btrfs_trim_state reserved_trim_state,
3322                        struct btrfs_trim_range *trim_entry)
3323 {
3324         struct btrfs_space_info *space_info = block_group->space_info;
3325         struct btrfs_fs_info *fs_info = block_group->fs_info;
3326         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3327         int ret;
3328         int update = 0;
3329         const u64 end = start + bytes;
3330         const u64 reserved_end = reserved_start + reserved_bytes;
3331         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3332         u64 trimmed = 0;
3333
3334         spin_lock(&space_info->lock);
3335         spin_lock(&block_group->lock);
3336         if (!block_group->ro) {
3337                 block_group->reserved += reserved_bytes;
3338                 space_info->bytes_reserved += reserved_bytes;
3339                 update = 1;
3340         }
3341         spin_unlock(&block_group->lock);
3342         spin_unlock(&space_info->lock);
3343
3344         ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3345         if (!ret) {
3346                 *total_trimmed += trimmed;
3347                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3348         }
3349
3350         mutex_lock(&ctl->cache_writeout_mutex);
3351         if (reserved_start < start)
3352                 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3353                                        start - reserved_start,
3354                                        reserved_trim_state);
3355         if (end < reserved_end)
3356                 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3357                                        reserved_trim_state);
3358         __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3359         list_del(&trim_entry->list);
3360         mutex_unlock(&ctl->cache_writeout_mutex);
3361
3362         if (update) {
3363                 spin_lock(&space_info->lock);
3364                 spin_lock(&block_group->lock);
3365                 if (block_group->ro)
3366                         space_info->bytes_readonly += reserved_bytes;
3367                 block_group->reserved -= reserved_bytes;
3368                 space_info->bytes_reserved -= reserved_bytes;
3369                 spin_unlock(&block_group->lock);
3370                 spin_unlock(&space_info->lock);
3371         }
3372
3373         return ret;
3374 }
3375
3376 /*
3377  * If @async is set, then we will trim 1 region and return.
3378  */
3379 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3380                           u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3381                           bool async)
3382 {
3383         struct btrfs_discard_ctl *discard_ctl =
3384                                         &block_group->fs_info->discard_ctl;
3385         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3386         struct btrfs_free_space *entry;
3387         struct rb_node *node;
3388         int ret = 0;
3389         u64 extent_start;
3390         u64 extent_bytes;
3391         enum btrfs_trim_state extent_trim_state;
3392         u64 bytes;
3393         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3394
3395         while (start < end) {
3396                 struct btrfs_trim_range trim_entry;
3397
3398                 mutex_lock(&ctl->cache_writeout_mutex);
3399                 spin_lock(&ctl->tree_lock);
3400
3401                 if (ctl->free_space < minlen)
3402                         goto out_unlock;
3403
3404                 entry = tree_search_offset(ctl, start, 0, 1);
3405                 if (!entry)
3406                         goto out_unlock;
3407
3408                 /* Skip bitmaps and, if async, already trimmed entries */
3409                 while (entry->bitmap ||
3410                        (async && btrfs_free_space_trimmed(entry))) {
3411                         node = rb_next(&entry->offset_index);
3412                         if (!node)
3413                                 goto out_unlock;
3414                         entry = rb_entry(node, struct btrfs_free_space,
3415                                          offset_index);
3416                 }
3417
3418                 if (entry->offset >= end)
3419                         goto out_unlock;
3420
3421                 extent_start = entry->offset;
3422                 extent_bytes = entry->bytes;
3423                 extent_trim_state = entry->trim_state;
3424                 if (async) {
3425                         start = entry->offset;
3426                         bytes = entry->bytes;
3427                         if (bytes < minlen) {
3428                                 spin_unlock(&ctl->tree_lock);
3429                                 mutex_unlock(&ctl->cache_writeout_mutex);
3430                                 goto next;
3431                         }
3432                         unlink_free_space(ctl, entry);
3433                         /*
3434                          * Let bytes = max_discard_size + X.
3435                          * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3436                          * X when we come back around.  So trim it now.
3437                          */
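                        /*
                         * E.g. (hypothetical sizes, max_discard_size = 64M):
                         * a 65M extent leaves a 1M remainder above the
                         * filter, so 64M is discarded now and 1M stays linked
                         * for a later pass; a 64M + 16K extent whose leftover
                         * is below the filter is instead discarded in one
                         * oversized go.
                         */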
3438                         if (max_discard_size &&
3439                             bytes >= (max_discard_size +
3440                                       BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3441                                 bytes = max_discard_size;
3442                                 extent_bytes = max_discard_size;
3443                                 entry->offset += max_discard_size;
3444                                 entry->bytes -= max_discard_size;
3445                                 link_free_space(ctl, entry);
3446                         } else {
3447                                 kmem_cache_free(btrfs_free_space_cachep, entry);
3448                         }
3449                 } else {
3450                         start = max(start, extent_start);
3451                         bytes = min(extent_start + extent_bytes, end) - start;
3452                         if (bytes < minlen) {
3453                                 spin_unlock(&ctl->tree_lock);
3454                                 mutex_unlock(&ctl->cache_writeout_mutex);
3455                                 goto next;
3456                         }
3457
3458                         unlink_free_space(ctl, entry);
3459                         kmem_cache_free(btrfs_free_space_cachep, entry);
3460                 }
3461
3462                 spin_unlock(&ctl->tree_lock);
3463                 trim_entry.start = extent_start;
3464                 trim_entry.bytes = extent_bytes;
3465                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3466                 mutex_unlock(&ctl->cache_writeout_mutex);
3467
3468                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3469                                   extent_start, extent_bytes, extent_trim_state,
3470                                   &trim_entry);
3471                 if (ret) {
3472                         block_group->discard_cursor = start + bytes;
3473                         break;
3474                 }
3475 next:
3476                 start += bytes;
3477                 block_group->discard_cursor = start;
3478                 if (async && *total_trimmed)
3479                         break;
3480
3481                 if (fatal_signal_pending(current)) {
3482                         ret = -ERESTARTSYS;
3483                         break;
3484                 }
3485
3486                 cond_resched();
3487         }
3488
3489         return ret;
3490
3491 out_unlock:
3492         block_group->discard_cursor = btrfs_block_group_end(block_group);
3493         spin_unlock(&ctl->tree_lock);
3494         mutex_unlock(&ctl->cache_writeout_mutex);
3495
3496         return ret;
3497 }
3498
3499 /*
3500  * If we break out of trimming a bitmap prematurely, we should reset the
3501  * trimming bit.  In a rather contrived case, it's possible to race here so
3502  * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3503  *
3504  * start = start of bitmap
3505  * end = near end of bitmap
3506  *
3507  * Thread 1:                    Thread 2:
3508  * trim_bitmaps(start)
3509  *                              trim_bitmaps(end)
3510  *                              end_trimming_bitmap()
3511  * reset_trimming_bitmap()
3512  */
3513 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3514 {
3515         struct btrfs_free_space *entry;
3516
3517         spin_lock(&ctl->tree_lock);
3518         entry = tree_search_offset(ctl, offset, 1, 0);
3519         if (entry) {
3520                 if (btrfs_free_space_trimmed(entry)) {
3521                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
3522                                 entry->bitmap_extents;
3523                         ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3524                 }
3525                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3526         }
3527
3528         spin_unlock(&ctl->tree_lock);
3529 }
3530
3531 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3532                                 struct btrfs_free_space *entry)
3533 {
3534         if (btrfs_free_space_trimming_bitmap(entry)) {
3535                 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3536                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3537                         entry->bitmap_extents;
3538                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3539         }
3540 }
3541
3542 /*
3543  * If @async is set, then we will trim 1 region and return.
3544  */
3545 static int trim_bitmaps(struct btrfs_block_group *block_group,
3546                         u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3547                         u64 maxlen, bool async)
3548 {
3549         struct btrfs_discard_ctl *discard_ctl =
3550                                         &block_group->fs_info->discard_ctl;
3551         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3552         struct btrfs_free_space *entry;
3553         int ret = 0;
3554         int ret2;
3555         u64 bytes;
3556         u64 offset = offset_to_bitmap(ctl, start);
3557         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3558
3559         while (offset < end) {
3560                 bool next_bitmap = false;
3561                 struct btrfs_trim_range trim_entry;
3562
3563                 mutex_lock(&ctl->cache_writeout_mutex);
3564                 spin_lock(&ctl->tree_lock);
3565
3566                 if (ctl->free_space < minlen) {
3567                         block_group->discard_cursor =
3568                                 btrfs_block_group_end(block_group);
3569                         spin_unlock(&ctl->tree_lock);
3570                         mutex_unlock(&ctl->cache_writeout_mutex);
3571                         break;
3572                 }
3573
3574                 entry = tree_search_offset(ctl, offset, 1, 0);
3575                 /*
3576                  * Bitmaps are marked trimmed lossily now to prevent constant
3577                  * discarding of the same bitmap (the reason why we are bound
3578                  * by the filters).  So, retrim the block group bitmaps when we
3579                  * are preparing to punt to the unused_bgs list.  This uses
3580                  * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
3581                  * the only discard index that sets minlen to 0.
3582                  */
3583                 if (!entry || (async && minlen && start == offset &&
3584                                btrfs_free_space_trimmed(entry))) {
3585                         spin_unlock(&ctl->tree_lock);
3586                         mutex_unlock(&ctl->cache_writeout_mutex);
3587                         next_bitmap = true;
3588                         goto next;
3589                 }
3590
3591                 /*
3592                  * Async discard bitmap trimming begins by setting the start
3593                  * to be key.objectid, and offset_to_bitmap() aligns it to the
3594                  * start of the bitmap.  This lets us know we are fully
3595                  * scanning the bitmap rather than only some portion of it.
3596                  */
3597                 if (start == offset)
3598                         entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3599
3600                 bytes = minlen;
3601                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3602                 if (ret2 || start >= end) {
3603                         /*
3604                          * We lossily consider a bitmap trimmed if we only skip
3605                          * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3606                          */
3607                         if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3608                                 end_trimming_bitmap(ctl, entry);
3609                         else
3610                                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3611                         spin_unlock(&ctl->tree_lock);
3612                         mutex_unlock(&ctl->cache_writeout_mutex);
3613                         next_bitmap = true;
3614                         goto next;
3615                 }
3616
3617                 /*
3618                  * We already trimmed a region, but are using the locking above
3619                  * to reset the trim_state.
3620                  */
3621                 if (async && *total_trimmed) {
3622                         spin_unlock(&ctl->tree_lock);
3623                         mutex_unlock(&ctl->cache_writeout_mutex);
3624                         goto out;
3625                 }
3626
3627                 bytes = min(bytes, end - start);
3628                 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3629                         spin_unlock(&ctl->tree_lock);
3630                         mutex_unlock(&ctl->cache_writeout_mutex);
3631                         goto next;
3632                 }
3633
3634                 /*
3635                  * Let bytes = max_discard_size + X.
3636                  * If X < @minlen, we won't trim X when we come back around.
3637                  * So trim it now.  We differ here from trimming extents as we
3638                  * don't keep individual state per bit.
3639                  */
3640                 if (async &&
3641                     max_discard_size &&
3642                     bytes > (max_discard_size + minlen))
3643                         bytes = max_discard_size;
3644
3645                 bitmap_clear_bits(ctl, entry, start, bytes);
3646                 if (entry->bytes == 0)
3647                         free_bitmap(ctl, entry);
3648
3649                 spin_unlock(&ctl->tree_lock);
3650                 trim_entry.start = start;
3651                 trim_entry.bytes = bytes;
3652                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3653                 mutex_unlock(&ctl->cache_writeout_mutex);
3654
3655                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3656                                   start, bytes, 0, &trim_entry);
3657                 if (ret) {
3658                         reset_trimming_bitmap(ctl, offset);
3659                         block_group->discard_cursor =
3660                                 btrfs_block_group_end(block_group);
3661                         break;
3662                 }
3663 next:
3664                 if (next_bitmap) {
3665                         offset += BITS_PER_BITMAP * ctl->unit;
3666                         start = offset;
3667                 } else {
3668                         start += bytes;
3669                 }
3670                 block_group->discard_cursor = start;
3671
3672                 if (fatal_signal_pending(current)) {
3673                         if (start != offset)
3674                                 reset_trimming_bitmap(ctl, offset);
3675                         ret = -ERESTARTSYS;
3676                         break;
3677                 }
3678
3679                 cond_resched();
3680         }
3681
3682         if (offset >= end)
3683                 block_group->discard_cursor = end;
3684
3685 out:
3686         return ret;
3687 }
3688
3689 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3690                            u64 *trimmed, u64 start, u64 end, u64 minlen)
3691 {
3692         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3693         int ret;
3694         u64 rem = 0;
3695
3696         *trimmed = 0;
3697
3698         spin_lock(&block_group->lock);
3699         if (block_group->removed) {
3700                 spin_unlock(&block_group->lock);
3701                 return 0;
3702         }
3703         btrfs_freeze_block_group(block_group);
3704         spin_unlock(&block_group->lock);
3705
3706         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3707         if (ret)
3708                 goto out;
3709
3710         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3711         div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3712         /* If we ended in the middle of a bitmap, reset the trimming flag */
3713         if (rem)
3714                 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3715 out:
3716         btrfs_unfreeze_block_group(block_group);
3717         return ret;
3718 }
3719
3720 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
3721                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3722                                    bool async)
3723 {
3724         int ret;
3725
3726         *trimmed = 0;
3727
3728         spin_lock(&block_group->lock);
3729         if (block_group->removed) {
3730                 spin_unlock(&block_group->lock);
3731                 return 0;
3732         }
3733         btrfs_freeze_block_group(block_group);
3734         spin_unlock(&block_group->lock);
3735
3736         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
3737         btrfs_unfreeze_block_group(block_group);
3738
3739         return ret;
3740 }
3741
3742 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
3743                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3744                                    u64 maxlen, bool async)
3745 {
3746         int ret;
3747
3748         *trimmed = 0;
3749
3750         spin_lock(&block_group->lock);
3751         if (block_group->removed) {
3752                 spin_unlock(&block_group->lock);
3753                 return 0;
3754         }
3755         btrfs_freeze_block_group(block_group);
3756         spin_unlock(&block_group->lock);
3757
3758         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
3759                            async);
3760
3761         btrfs_unfreeze_block_group(block_group);
3762
3763         return ret;
3764 }
3765
3766 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3767 /*
3768  * Use this if you need to make a bitmap or extent entry specifically.  It
3769  * doesn't do any of the merging that add_free_space does; it acts a lot like
3770  * how the free space cache loading code works, so you can get really weird
3771  * configurations.
3772  */
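/*
 * Usage sketch (an assumption about how a sanity test might drive these
 * helpers, not copied from the real test suite):
 *
 *	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
 *	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
 *	if (!test_check_exists(cache, SZ_4M, SZ_1M))
 *		return -EINVAL;	/* the bitmap range we just added is missing */
 */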
3773 int test_add_free_space_entry(struct btrfs_block_group *cache,
3774                               u64 offset, u64 bytes, bool bitmap)
3775 {
3776         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3777         struct btrfs_free_space *info = NULL, *bitmap_info;
3778         void *map = NULL;
3779         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
3780         u64 bytes_added;
3781         int ret;
3782
3783 again:
3784         if (!info) {
3785                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3786                 if (!info)
3787                         return -ENOMEM;
3788         }
3789
3790         if (!bitmap) {
3791                 spin_lock(&ctl->tree_lock);
3792                 info->offset = offset;
3793                 info->bytes = bytes;
3794                 info->max_extent_size = 0;
3795                 ret = link_free_space(ctl, info);
3796                 spin_unlock(&ctl->tree_lock);
3797                 if (ret)
3798                         kmem_cache_free(btrfs_free_space_cachep, info);
3799                 return ret;
3800         }
3801
3802         if (!map) {
3803                 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
3804                 if (!map) {
3805                         kmem_cache_free(btrfs_free_space_cachep, info);
3806                         return -ENOMEM;
3807                 }
3808         }
3809
3810         spin_lock(&ctl->tree_lock);
3811         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3812                                          1, 0);
3813         if (!bitmap_info) {
3814                 info->bitmap = map;
3815                 map = NULL;
3816                 add_new_bitmap(ctl, info, offset);
3817                 bitmap_info = info;
3818                 info = NULL;
3819         }
3820
3821         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
3822                                           trim_state);
3823
3824         bytes -= bytes_added;
3825         offset += bytes_added;
3826         spin_unlock(&ctl->tree_lock);
3827
3828         if (bytes)
3829                 goto again;
3830
3831         if (info)
3832                 kmem_cache_free(btrfs_free_space_cachep, info);
3833         if (map)
3834                 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
3835         return 0;
3836 }
3837
3838 /*
3839  * Checks whether the given range is in the free space cache.  This is really
3840  * just used to check for the absence of space, so if there is any free space
3841  * in the range at all we will return 1.
3842  */
3843 int test_check_exists(struct btrfs_block_group *cache,
3844                       u64 offset, u64 bytes)
3845 {
3846         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3847         struct btrfs_free_space *info;
3848         int ret = 0;
3849
3850         spin_lock(&ctl->tree_lock);
3851         info = tree_search_offset(ctl, offset, 0, 0);
3852         if (!info) {
3853                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3854                                           1, 0);
3855                 if (!info)
3856                         goto out;
3857         }
3858
3859 have_info:
3860         if (info->bitmap) {
3861                 u64 bit_off, bit_bytes;
3862                 struct rb_node *n;
3863                 struct btrfs_free_space *tmp;
3864
3865                 bit_off = offset;
3866                 bit_bytes = ctl->unit;
3867                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3868                 if (!ret) {
3869                         if (bit_off == offset) {
3870                                 ret = 1;
3871                                 goto out;
3872                         } else if (bit_off > offset &&
3873                                    offset + bytes > bit_off) {
3874                                 ret = 1;
3875                                 goto out;
3876                         }
3877                 }
3878
3879                 n = rb_prev(&info->offset_index);
3880                 while (n) {
3881                         tmp = rb_entry(n, struct btrfs_free_space,
3882                                        offset_index);
3883                         if (tmp->offset + tmp->bytes < offset)
3884                                 break;
3885                         if (offset + bytes < tmp->offset) {
3886                                 n = rb_prev(&tmp->offset_index);
3887                                 continue;
3888                         }
3889                         info = tmp;
3890                         goto have_info;
3891                 }
3892
3893                 n = rb_next(&info->offset_index);
3894                 while (n) {
3895                         tmp = rb_entry(n, struct btrfs_free_space,
3896                                        offset_index);
3897                         if (offset + bytes < tmp->offset)
3898                                 break;
3899                         if (tmp->offset + tmp->bytes < offset) {
3900                                 n = rb_next(&tmp->offset_index);
3901                                 continue;
3902                         }
3903                         info = tmp;
3904                         goto have_info;
3905                 }
3906
3907                 ret = 0;
3908                 goto out;
3909         }
3910
3911         if (info->offset == offset) {
3912                 ret = 1;
3913                 goto out;
3914         }
3915
3916         if (offset > info->offset && offset < info->offset + info->bytes)
3917                 ret = 1;
3918 out:
3919         spin_unlock(&ctl->tree_lock);
3920         return ret;
3921 }
3922 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */