1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2008 Red Hat.  All rights reserved.
4 */
5
6#include <linux/pagemap.h>
7#include <linux/sched.h>
8#include <linux/sched/signal.h>
9#include <linux/slab.h>
10#include <linux/math64.h>
11#include <linux/ratelimit.h>
12#include <linux/error-injection.h>
13#include <linux/sched/mm.h>
14#include "ctree.h"
15#include "fs.h"
16#include "messages.h"
17#include "misc.h"
18#include "free-space-cache.h"
19#include "transaction.h"
20#include "disk-io.h"
21#include "extent_io.h"
22#include "volumes.h"
23#include "space-info.h"
24#include "delalloc-space.h"
25#include "block-group.h"
26#include "discard.h"
27#include "subpage.h"
28#include "inode-item.h"
29#include "accessors.h"
30#include "file-item.h"
31#include "file.h"
32#include "super.h"
33
34#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
35#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
36#define FORCE_EXTENT_THRESHOLD	SZ_1M
37
38static struct kmem_cache *btrfs_free_space_cachep;
39static struct kmem_cache *btrfs_free_space_bitmap_cachep;
40
41struct btrfs_trim_range {
42	u64 start;
43	u64 bytes;
44	struct list_head list;
45};
46
47static int link_free_space(struct btrfs_free_space_ctl *ctl,
48			   struct btrfs_free_space *info);
49static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
50			      struct btrfs_free_space *info, bool update_stat);
51static int search_bitmap(struct btrfs_free_space_ctl *ctl,
52			 struct btrfs_free_space *bitmap_info, u64 *offset,
53			 u64 *bytes, bool for_alloc);
54static void free_bitmap(struct btrfs_free_space_ctl *ctl,
55			struct btrfs_free_space *bitmap_info);
56static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
57			      struct btrfs_free_space *info, u64 offset,
58			      u64 bytes, bool update_stats);
59
60static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
61{
62	struct btrfs_free_space *info;
63	struct rb_node *node;
64
65	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
66		info = rb_entry(node, struct btrfs_free_space, offset_index);
67		if (!info->bitmap) {
68			unlink_free_space(ctl, info, true);
69			kmem_cache_free(btrfs_free_space_cachep, info);
70		} else {
71			free_bitmap(ctl, info);
72		}
73
74		cond_resched_lock(&ctl->tree_lock);
75	}
76}
77
78static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
79					       struct btrfs_path *path,
80					       u64 offset)
81{
82	struct btrfs_fs_info *fs_info = root->fs_info;
83	struct btrfs_key key;
84	struct btrfs_key location;
85	struct btrfs_disk_key disk_key;
86	struct btrfs_free_space_header *header;
87	struct extent_buffer *leaf;
88	struct inode *inode = NULL;
89	unsigned nofs_flag;
90	int ret;
91
92	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
93	key.offset = offset;
94	key.type = 0;
95
96	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
97	if (ret < 0)
98		return ERR_PTR(ret);
99	if (ret > 0) {
100		btrfs_release_path(path);
101		return ERR_PTR(-ENOENT);
102	}
103
104	leaf = path->nodes[0];
105	header = btrfs_item_ptr(leaf, path->slots[0],
106				struct btrfs_free_space_header);
107	btrfs_free_space_key(leaf, header, &disk_key);
108	btrfs_disk_key_to_cpu(&location, &disk_key);
109	btrfs_release_path(path);
110
111	/*
112	 * We are often under a trans handle at this point, so we need to make
113	 * sure NOFS is set to keep us from deadlocking.
114	 */
115	nofs_flag = memalloc_nofs_save();
116	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
117	btrfs_release_path(path);
118	memalloc_nofs_restore(nofs_flag);
119	if (IS_ERR(inode))
120		return inode;
121
122	mapping_set_gfp_mask(inode->i_mapping,
123			mapping_gfp_constraint(inode->i_mapping,
124			~(__GFP_FS | __GFP_HIGHMEM)));
125
126	return inode;
127}
128
129struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
130		struct btrfs_path *path)
131{
132	struct btrfs_fs_info *fs_info = block_group->fs_info;
133	struct inode *inode = NULL;
134	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
135
136	spin_lock(&block_group->lock);
137	if (block_group->inode)
138		inode = igrab(block_group->inode);
139	spin_unlock(&block_group->lock);
140	if (inode)
141		return inode;
142
143	inode = __lookup_free_space_inode(fs_info->tree_root, path,
144					  block_group->start);
145	if (IS_ERR(inode))
146		return inode;
147
148	spin_lock(&block_group->lock);
149	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
150		btrfs_info(fs_info, "Old style space inode found, converting.");
151		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
152			BTRFS_INODE_NODATACOW;
153		block_group->disk_cache_state = BTRFS_DC_CLEAR;
154	}
155
156	if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
157		block_group->inode = igrab(inode);
158	spin_unlock(&block_group->lock);
159
160	return inode;
161}
162
163static int __create_free_space_inode(struct btrfs_root *root,
164				     struct btrfs_trans_handle *trans,
165				     struct btrfs_path *path,
166				     u64 ino, u64 offset)
167{
168	struct btrfs_key key;
169	struct btrfs_disk_key disk_key;
170	struct btrfs_free_space_header *header;
171	struct btrfs_inode_item *inode_item;
172	struct extent_buffer *leaf;
173	/* We inline CRCs for the free disk space cache */
174	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
175			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
176	int ret;
177
178	ret = btrfs_insert_empty_inode(trans, root, path, ino);
179	if (ret)
180		return ret;
181
182	leaf = path->nodes[0];
183	inode_item = btrfs_item_ptr(leaf, path->slots[0],
184				    struct btrfs_inode_item);
185	btrfs_item_key(leaf, &disk_key, path->slots[0]);
186	memzero_extent_buffer(leaf, (unsigned long)inode_item,
187			     sizeof(*inode_item));
188	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
189	btrfs_set_inode_size(leaf, inode_item, 0);
190	btrfs_set_inode_nbytes(leaf, inode_item, 0);
191	btrfs_set_inode_uid(leaf, inode_item, 0);
192	btrfs_set_inode_gid(leaf, inode_item, 0);
193	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
194	btrfs_set_inode_flags(leaf, inode_item, flags);
195	btrfs_set_inode_nlink(leaf, inode_item, 1);
196	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
197	btrfs_set_inode_block_group(leaf, inode_item, offset);
198	btrfs_mark_buffer_dirty(trans, leaf);
199	btrfs_release_path(path);
200
201	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
202	key.offset = offset;
203	key.type = 0;
204	ret = btrfs_insert_empty_item(trans, root, path, &key,
205				      sizeof(struct btrfs_free_space_header));
206	if (ret < 0) {
207		btrfs_release_path(path);
208		return ret;
209	}
210
211	leaf = path->nodes[0];
212	header = btrfs_item_ptr(leaf, path->slots[0],
213				struct btrfs_free_space_header);
214	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
215	btrfs_set_free_space_key(leaf, header, &disk_key);
216	btrfs_mark_buffer_dirty(trans, leaf);
217	btrfs_release_path(path);
218
219	return 0;
220}
221
222int create_free_space_inode(struct btrfs_trans_handle *trans,
223			    struct btrfs_block_group *block_group,
224			    struct btrfs_path *path)
225{
226	int ret;
227	u64 ino;
228
229	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
230	if (ret < 0)
231		return ret;
232
233	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
234					 ino, block_group->start);
235}
236
237/*
238 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
239 * handles lookup, otherwise it takes ownership and iputs the inode.
240 * Don't reuse an inode pointer after passing it into this function.
241 */
242int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
243				  struct inode *inode,
244				  struct btrfs_block_group *block_group)
245{
246	struct btrfs_path *path;
247	struct btrfs_key key;
248	int ret = 0;
249
250	path = btrfs_alloc_path();
251	if (!path)
252		return -ENOMEM;
253
254	if (!inode)
255		inode = lookup_free_space_inode(block_group, path);
256	if (IS_ERR(inode)) {
257		if (PTR_ERR(inode) != -ENOENT)
258			ret = PTR_ERR(inode);
259		goto out;
260	}
261	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
262	if (ret) {
263		btrfs_add_delayed_iput(BTRFS_I(inode));
264		goto out;
265	}
266	clear_nlink(inode);
267	/* One for the block groups ref */
268	spin_lock(&block_group->lock);
269	if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
270		block_group->inode = NULL;
271		spin_unlock(&block_group->lock);
272		iput(inode);
273	} else {
274		spin_unlock(&block_group->lock);
275	}
276	/* One for the lookup ref */
277	btrfs_add_delayed_iput(BTRFS_I(inode));
278
279	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
280	key.type = 0;
281	key.offset = block_group->start;
282	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
283				-1, 1);
284	if (ret) {
285		if (ret > 0)
286			ret = 0;
287		goto out;
288	}
289	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
290out:
291	btrfs_free_path(path);
292	return ret;
293}
294
295int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
296				    struct btrfs_block_group *block_group,
297				    struct inode *vfs_inode)
298{
299	struct btrfs_truncate_control control = {
300		.inode = BTRFS_I(vfs_inode),
301		.new_size = 0,
302		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
303		.min_type = BTRFS_EXTENT_DATA_KEY,
304		.clear_extent_range = true,
305	};
306	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
307	struct btrfs_root *root = inode->root;
308	struct extent_state *cached_state = NULL;
309	int ret = 0;
310	bool locked = false;
311
312	if (block_group) {
313		struct btrfs_path *path = btrfs_alloc_path();
314
315		if (!path) {
316			ret = -ENOMEM;
317			goto fail;
318		}
319		locked = true;
320		mutex_lock(&trans->transaction->cache_write_mutex);
321		if (!list_empty(&block_group->io_list)) {
322			list_del_init(&block_group->io_list);
323
324			btrfs_wait_cache_io(trans, block_group, path);
325			btrfs_put_block_group(block_group);
326		}
327
328		/*
329		 * now that we've truncated the cache away, its no longer
330		 * setup or written
331		 */
332		spin_lock(&block_group->lock);
333		block_group->disk_cache_state = BTRFS_DC_CLEAR;
334		spin_unlock(&block_group->lock);
335		btrfs_free_path(path);
336	}
337
338	btrfs_i_size_write(inode, 0);
339	truncate_pagecache(vfs_inode, 0);
340
341	lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
342	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
343
344	/*
345	 * We skip the throttling logic for free space cache inodes, so we don't
346	 * need to check for -EAGAIN.
347	 */
348	ret = btrfs_truncate_inode_items(trans, root, &control);
349
350	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
351	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
352
353	unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
354	if (ret)
355		goto fail;
356
357	ret = btrfs_update_inode(trans, root, inode);
358
359fail:
360	if (locked)
361		mutex_unlock(&trans->transaction->cache_write_mutex);
362	if (ret)
363		btrfs_abort_transaction(trans, ret);
364
365	return ret;
366}
367
368static void readahead_cache(struct inode *inode)
369{
370	struct file_ra_state ra;
371	unsigned long last_index;
372
373	file_ra_state_init(&ra, inode->i_mapping);
374	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
375
376	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
377}
378
379static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
380		       int write)
381{
382	int num_pages;
383
384	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
385
386	/* Make sure we can fit our crcs and generation into the first page */
387	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
388		return -ENOSPC;
389
390	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
391
392	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
393	if (!io_ctl->pages)
394		return -ENOMEM;
395
396	io_ctl->num_pages = num_pages;
397	io_ctl->fs_info = btrfs_sb(inode->i_sb);
398	io_ctl->inode = inode;
399
400	return 0;
401}
402ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
403
404static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
405{
406	kfree(io_ctl->pages);
407	io_ctl->pages = NULL;
408}
409
410static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
411{
412	if (io_ctl->cur) {
413		io_ctl->cur = NULL;
414		io_ctl->orig = NULL;
415	}
416}
417
418static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
419{
420	ASSERT(io_ctl->index < io_ctl->num_pages);
421	io_ctl->page = io_ctl->pages[io_ctl->index++];
422	io_ctl->cur = page_address(io_ctl->page);
423	io_ctl->orig = io_ctl->cur;
424	io_ctl->size = PAGE_SIZE;
425	if (clear)
426		clear_page(io_ctl->cur);
427}
428
429static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
430{
431	int i;
432
433	io_ctl_unmap_page(io_ctl);
434
435	for (i = 0; i < io_ctl->num_pages; i++) {
436		if (io_ctl->pages[i]) {
437			btrfs_page_clear_checked(io_ctl->fs_info,
438					io_ctl->pages[i],
439					page_offset(io_ctl->pages[i]),
440					PAGE_SIZE);
441			unlock_page(io_ctl->pages[i]);
442			put_page(io_ctl->pages[i]);
443		}
444	}
445}
446
447static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
448{
449	struct page *page;
450	struct inode *inode = io_ctl->inode;
451	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
452	int i;
453
454	for (i = 0; i < io_ctl->num_pages; i++) {
455		int ret;
456
457		page = find_or_create_page(inode->i_mapping, i, mask);
458		if (!page) {
459			io_ctl_drop_pages(io_ctl);
460			return -ENOMEM;
461		}
462
463		ret = set_page_extent_mapped(page);
464		if (ret < 0) {
465			unlock_page(page);
466			put_page(page);
467			io_ctl_drop_pages(io_ctl);
468			return ret;
469		}
470
471		io_ctl->pages[i] = page;
472		if (uptodate && !PageUptodate(page)) {
473			btrfs_read_folio(NULL, page_folio(page));
474			lock_page(page);
475			if (page->mapping != inode->i_mapping) {
476				btrfs_err(BTRFS_I(inode)->root->fs_info,
477					  "free space cache page truncated");
478				io_ctl_drop_pages(io_ctl);
479				return -EIO;
480			}
481			if (!PageUptodate(page)) {
482				btrfs_err(BTRFS_I(inode)->root->fs_info,
483					   "error reading free space cache");
484				io_ctl_drop_pages(io_ctl);
485				return -EIO;
486			}
487		}
488	}
489
490	for (i = 0; i < io_ctl->num_pages; i++)
491		clear_page_dirty_for_io(io_ctl->pages[i]);
492
493	return 0;
494}
495
496static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
497{
498	io_ctl_map_page(io_ctl, 1);
499
500	/*
501	 * Skip the csum areas.  If we don't check crcs then we just have a
502	 * 64bit chunk at the front of the first page.
503	 */
504	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
505	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
506
507	put_unaligned_le64(generation, io_ctl->cur);
508	io_ctl->cur += sizeof(u64);
509}
510
511static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
512{
513	u64 cache_gen;
514
515	/*
516	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
517	 * chunk at the front of the first page.
518	 */
519	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
520	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
521
522	cache_gen = get_unaligned_le64(io_ctl->cur);
523	if (cache_gen != generation) {
524		btrfs_err_rl(io_ctl->fs_info,
525			"space cache generation (%llu) does not match inode (%llu)",
526				cache_gen, generation);
527		io_ctl_unmap_page(io_ctl);
528		return -EIO;
529	}
530	io_ctl->cur += sizeof(u64);
531	return 0;
532}
533
534static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
535{
536	u32 *tmp;
537	u32 crc = ~(u32)0;
538	unsigned offset = 0;
539
540	if (index == 0)
541		offset = sizeof(u32) * io_ctl->num_pages;
542
543	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
544	btrfs_crc32c_final(crc, (u8 *)&crc);
545	io_ctl_unmap_page(io_ctl);
546	tmp = page_address(io_ctl->pages[0]);
547	tmp += index;
548	*tmp = crc;
549}
550
551static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
552{
553	u32 *tmp, val;
554	u32 crc = ~(u32)0;
555	unsigned offset = 0;
556
557	if (index == 0)
558		offset = sizeof(u32) * io_ctl->num_pages;
559
560	tmp = page_address(io_ctl->pages[0]);
561	tmp += index;
562	val = *tmp;
563
564	io_ctl_map_page(io_ctl, 0);
565	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
566	btrfs_crc32c_final(crc, (u8 *)&crc);
567	if (val != crc) {
568		btrfs_err_rl(io_ctl->fs_info,
569			"csum mismatch on free space cache");
570		io_ctl_unmap_page(io_ctl);
571		return -EIO;
572	}
573
574	return 0;
575}
576
577static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
578			    void *bitmap)
579{
580	struct btrfs_free_space_entry *entry;
581
582	if (!io_ctl->cur)
583		return -ENOSPC;
584
585	entry = io_ctl->cur;
586	put_unaligned_le64(offset, &entry->offset);
587	put_unaligned_le64(bytes, &entry->bytes);
588	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
589		BTRFS_FREE_SPACE_EXTENT;
590	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
591	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
592
593	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
594		return 0;
595
596	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
597
598	/* No more pages to map */
599	if (io_ctl->index >= io_ctl->num_pages)
600		return 0;
601
602	/* map the next page */
603	io_ctl_map_page(io_ctl, 1);
604	return 0;
605}
606
607static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
608{
609	if (!io_ctl->cur)
610		return -ENOSPC;
611
612	/*
613	 * If we aren't at the start of the current page, unmap this one and
614	 * map the next one if there is any left.
615	 */
616	if (io_ctl->cur != io_ctl->orig) {
617		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
618		if (io_ctl->index >= io_ctl->num_pages)
619			return -ENOSPC;
620		io_ctl_map_page(io_ctl, 0);
621	}
622
623	copy_page(io_ctl->cur, bitmap);
624	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
625	if (io_ctl->index < io_ctl->num_pages)
626		io_ctl_map_page(io_ctl, 0);
627	return 0;
628}
629
630static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
631{
632	/*
633	 * If we're not on the boundary we know we've modified the page and we
634	 * need to crc the page.
635	 */
636	if (io_ctl->cur != io_ctl->orig)
637		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
638	else
639		io_ctl_unmap_page(io_ctl);
640
641	while (io_ctl->index < io_ctl->num_pages) {
642		io_ctl_map_page(io_ctl, 1);
643		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
644	}
645}
646
647static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
648			    struct btrfs_free_space *entry, u8 *type)
649{
650	struct btrfs_free_space_entry *e;
651	int ret;
652
653	if (!io_ctl->cur) {
654		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
655		if (ret)
656			return ret;
657	}
658
659	e = io_ctl->cur;
660	entry->offset = get_unaligned_le64(&e->offset);
661	entry->bytes = get_unaligned_le64(&e->bytes);
662	*type = e->type;
663	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
664	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
665
666	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
667		return 0;
668
669	io_ctl_unmap_page(io_ctl);
670
671	return 0;
672}
673
674static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
675			      struct btrfs_free_space *entry)
676{
677	int ret;
678
679	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
680	if (ret)
681		return ret;
682
683	copy_page(entry->bitmap, io_ctl->cur);
684	io_ctl_unmap_page(io_ctl);
685
686	return 0;
687}
688
689static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
690{
691	struct btrfs_block_group *block_group = ctl->block_group;
692	u64 max_bytes;
693	u64 bitmap_bytes;
694	u64 extent_bytes;
695	u64 size = block_group->length;
696	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
697	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
698
699	max_bitmaps = max_t(u64, max_bitmaps, 1);
700
701	if (ctl->total_bitmaps > max_bitmaps)
702		btrfs_err(block_group->fs_info,
703"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
704			  block_group->start, block_group->length,
705			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
706			  bytes_per_bg);
707	ASSERT(ctl->total_bitmaps <= max_bitmaps);
708
709	/*
710	 * We are trying to keep the total amount of memory used per 1GiB of
711	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
712	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
713	 * bitmaps, we may end up using more memory than this.
714	 */
715	if (size < SZ_1G)
716		max_bytes = MAX_CACHE_BYTES_PER_GIG;
717	else
718		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
719
720	bitmap_bytes = ctl->total_bitmaps * ctl->unit;
721
722	/*
723	 * we want the extent entry threshold to always be at most 1/2 the max
724	 * bytes we can have, or whatever is less than that.
725	 */
726	extent_bytes = max_bytes - bitmap_bytes;
727	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
728
729	ctl->extents_thresh =
730		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
731}
732
733static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
734				   struct btrfs_free_space_ctl *ctl,
735				   struct btrfs_path *path, u64 offset)
736{
737	struct btrfs_fs_info *fs_info = root->fs_info;
738	struct btrfs_free_space_header *header;
739	struct extent_buffer *leaf;
740	struct btrfs_io_ctl io_ctl;
741	struct btrfs_key key;
742	struct btrfs_free_space *e, *n;
743	LIST_HEAD(bitmaps);
744	u64 num_entries;
745	u64 num_bitmaps;
746	u64 generation;
747	u8 type;
748	int ret = 0;
749
750	/* Nothing in the space cache, goodbye */
751	if (!i_size_read(inode))
752		return 0;
753
754	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
755	key.offset = offset;
756	key.type = 0;
757
758	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
759	if (ret < 0)
760		return 0;
761	else if (ret > 0) {
762		btrfs_release_path(path);
763		return 0;
764	}
765
766	ret = -1;
767
768	leaf = path->nodes[0];
769	header = btrfs_item_ptr(leaf, path->slots[0],
770				struct btrfs_free_space_header);
771	num_entries = btrfs_free_space_entries(leaf, header);
772	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
773	generation = btrfs_free_space_generation(leaf, header);
774	btrfs_release_path(path);
775
776	if (!BTRFS_I(inode)->generation) {
777		btrfs_info(fs_info,
778			   "the free space cache file (%llu) is invalid, skip it",
779			   offset);
780		return 0;
781	}
782
783	if (BTRFS_I(inode)->generation != generation) {
784		btrfs_err(fs_info,
785			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
786			  BTRFS_I(inode)->generation, generation);
787		return 0;
788	}
789
790	if (!num_entries)
791		return 0;
792
793	ret = io_ctl_init(&io_ctl, inode, 0);
794	if (ret)
795		return ret;
796
797	readahead_cache(inode);
798
799	ret = io_ctl_prepare_pages(&io_ctl, true);
800	if (ret)
801		goto out;
802
803	ret = io_ctl_check_crc(&io_ctl, 0);
804	if (ret)
805		goto free_cache;
806
807	ret = io_ctl_check_generation(&io_ctl, generation);
808	if (ret)
809		goto free_cache;
810
811	while (num_entries) {
812		e = kmem_cache_zalloc(btrfs_free_space_cachep,
813				      GFP_NOFS);
814		if (!e) {
815			ret = -ENOMEM;
816			goto free_cache;
817		}
818
819		ret = io_ctl_read_entry(&io_ctl, e, &type);
820		if (ret) {
821			kmem_cache_free(btrfs_free_space_cachep, e);
822			goto free_cache;
823		}
824
825		if (!e->bytes) {
826			ret = -1;
827			kmem_cache_free(btrfs_free_space_cachep, e);
828			goto free_cache;
829		}
830
831		if (type == BTRFS_FREE_SPACE_EXTENT) {
832			spin_lock(&ctl->tree_lock);
833			ret = link_free_space(ctl, e);
834			spin_unlock(&ctl->tree_lock);
835			if (ret) {
836				btrfs_err(fs_info,
837					"Duplicate entries in free space cache, dumping");
838				kmem_cache_free(btrfs_free_space_cachep, e);
839				goto free_cache;
840			}
841		} else {
842			ASSERT(num_bitmaps);
843			num_bitmaps--;
844			e->bitmap = kmem_cache_zalloc(
845					btrfs_free_space_bitmap_cachep, GFP_NOFS);
846			if (!e->bitmap) {
847				ret = -ENOMEM;
848				kmem_cache_free(
849					btrfs_free_space_cachep, e);
850				goto free_cache;
851			}
852			spin_lock(&ctl->tree_lock);
853			ret = link_free_space(ctl, e);
854			if (ret) {
855				spin_unlock(&ctl->tree_lock);
856				btrfs_err(fs_info,
857					"Duplicate entries in free space cache, dumping");
858				kmem_cache_free(btrfs_free_space_cachep, e);
859				goto free_cache;
860			}
861			ctl->total_bitmaps++;
862			recalculate_thresholds(ctl);
863			spin_unlock(&ctl->tree_lock);
864			list_add_tail(&e->list, &bitmaps);
865		}
866
867		num_entries--;
868	}
869
870	io_ctl_unmap_page(&io_ctl);
871
872	/*
873	 * We add the bitmaps at the end of the entries in order that
874	 * the bitmap entries are added to the cache.
875	 */
876	list_for_each_entry_safe(e, n, &bitmaps, list) {
877		list_del_init(&e->list);
878		ret = io_ctl_read_bitmap(&io_ctl, e);
879		if (ret)
880			goto free_cache;
881	}
882
883	io_ctl_drop_pages(&io_ctl);
884	ret = 1;
885out:
886	io_ctl_free(&io_ctl);
887	return ret;
888free_cache:
889	io_ctl_drop_pages(&io_ctl);
890
891	spin_lock(&ctl->tree_lock);
892	__btrfs_remove_free_space_cache(ctl);
893	spin_unlock(&ctl->tree_lock);
894	goto out;
895}
896
897static int copy_free_space_cache(struct btrfs_block_group *block_group,
898				 struct btrfs_free_space_ctl *ctl)
899{
900	struct btrfs_free_space *info;
901	struct rb_node *n;
902	int ret = 0;
903
904	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
905		info = rb_entry(n, struct btrfs_free_space, offset_index);
906		if (!info->bitmap) {
907			const u64 offset = info->offset;
908			const u64 bytes = info->bytes;
909
910			unlink_free_space(ctl, info, true);
911			spin_unlock(&ctl->tree_lock);
912			kmem_cache_free(btrfs_free_space_cachep, info);
913			ret = btrfs_add_free_space(block_group, offset, bytes);
914			spin_lock(&ctl->tree_lock);
915		} else {
916			u64 offset = info->offset;
917			u64 bytes = ctl->unit;
918
919			ret = search_bitmap(ctl, info, &offset, &bytes, false);
920			if (ret == 0) {
921				bitmap_clear_bits(ctl, info, offset, bytes, true);
922				spin_unlock(&ctl->tree_lock);
923				ret = btrfs_add_free_space(block_group, offset,
924							   bytes);
925				spin_lock(&ctl->tree_lock);
926			} else {
927				free_bitmap(ctl, info);
928				ret = 0;
929			}
930		}
931		cond_resched_lock(&ctl->tree_lock);
932	}
933	return ret;
934}
935
936static struct lock_class_key btrfs_free_space_inode_key;
937
938int load_free_space_cache(struct btrfs_block_group *block_group)
939{
940	struct btrfs_fs_info *fs_info = block_group->fs_info;
941	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
942	struct btrfs_free_space_ctl tmp_ctl = {};
943	struct inode *inode;
944	struct btrfs_path *path;
945	int ret = 0;
946	bool matched;
947	u64 used = block_group->used;
948
949	/*
950	 * Because we could potentially discard our loaded free space, we want
951	 * to load everything into a temporary structure first, and then if it's
952	 * valid copy it all into the actual free space ctl.
953	 */
954	btrfs_init_free_space_ctl(block_group, &tmp_ctl);
955
956	/*
957	 * If this block group has been marked to be cleared for one reason or
958	 * another then we can't trust the on disk cache, so just return.
959	 */
960	spin_lock(&block_group->lock);
961	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
962		spin_unlock(&block_group->lock);
963		return 0;
964	}
965	spin_unlock(&block_group->lock);
966
967	path = btrfs_alloc_path();
968	if (!path)
969		return 0;
970	path->search_commit_root = 1;
971	path->skip_locking = 1;
972
973	/*
974	 * We must pass a path with search_commit_root set to btrfs_iget in
975	 * order to avoid a deadlock when allocating extents for the tree root.
976	 *
977	 * When we are COWing an extent buffer from the tree root, when looking
978	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
979	 * block group without its free space cache loaded. When we find one
980	 * we must load its space cache which requires reading its free space
981	 * cache's inode item from the root tree. If this inode item is located
982	 * in the same leaf that we started COWing before, then we end up in
983	 * deadlock on the extent buffer (trying to read lock it when we
984	 * previously write locked it).
985	 *
986	 * It's safe to read the inode item using the commit root because
987	 * block groups, once loaded, stay in memory forever (until they are
988	 * removed) as well as their space caches once loaded. New block groups
989	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
990	 * we will never try to read their inode item while the fs is mounted.
991	 */
992	inode = lookup_free_space_inode(block_group, path);
993	if (IS_ERR(inode)) {
994		btrfs_free_path(path);
995		return 0;
996	}
997
998	/* We may have converted the inode and made the cache invalid. */
999	spin_lock(&block_group->lock);
1000	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
1001		spin_unlock(&block_group->lock);
1002		btrfs_free_path(path);
1003		goto out;
1004	}
1005	spin_unlock(&block_group->lock);
1006
1007	/*
1008	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
1009	 * free space inodes to prevent false positives related to locks for normal
1010	 * inodes.
1011	 */
1012	lockdep_set_class(&(&inode->i_data)->invalidate_lock,
1013			  &btrfs_free_space_inode_key);
1014
1015	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
1016				      path, block_group->start);
1017	btrfs_free_path(path);
1018	if (ret <= 0)
1019		goto out;
1020
1021	matched = (tmp_ctl.free_space == (block_group->length - used -
1022					  block_group->bytes_super));
1023
1024	if (matched) {
1025		spin_lock(&tmp_ctl.tree_lock);
1026		ret = copy_free_space_cache(block_group, &tmp_ctl);
1027		spin_unlock(&tmp_ctl.tree_lock);
1028		/*
1029		 * ret == 1 means we successfully loaded the free space cache,
1030		 * so we need to re-set it here.
1031		 */
1032		if (ret == 0)
1033			ret = 1;
1034	} else {
1035		/*
1036		 * We need to call the _locked variant so we don't try to update
1037		 * the discard counters.
1038		 */
1039		spin_lock(&tmp_ctl.tree_lock);
1040		__btrfs_remove_free_space_cache(&tmp_ctl);
1041		spin_unlock(&tmp_ctl.tree_lock);
1042		btrfs_warn(fs_info,
1043			   "block group %llu has wrong amount of free space",
1044			   block_group->start);
1045		ret = -1;
1046	}
1047out:
1048	if (ret < 0) {
1049		/* This cache is bogus, make sure it gets cleared */
1050		spin_lock(&block_group->lock);
1051		block_group->disk_cache_state = BTRFS_DC_CLEAR;
1052		spin_unlock(&block_group->lock);
1053		ret = 0;
1054
1055		btrfs_warn(fs_info,
1056			   "failed to load free space cache for block group %llu, rebuilding it now",
1057			   block_group->start);
1058	}
1059
1060	spin_lock(&ctl->tree_lock);
1061	btrfs_discard_update_discardable(block_group);
1062	spin_unlock(&ctl->tree_lock);
1063	iput(inode);
1064	return ret;
1065}
1066
1067static noinline_for_stack
1068int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1069			      struct btrfs_free_space_ctl *ctl,
1070			      struct btrfs_block_group *block_group,
1071			      int *entries, int *bitmaps,
1072			      struct list_head *bitmap_list)
1073{
1074	int ret;
1075	struct btrfs_free_cluster *cluster = NULL;
1076	struct btrfs_free_cluster *cluster_locked = NULL;
1077	struct rb_node *node = rb_first(&ctl->free_space_offset);
1078	struct btrfs_trim_range *trim_entry;
1079
1080	/* Get the cluster for this block_group if it exists */
1081	if (block_group && !list_empty(&block_group->cluster_list)) {
1082		cluster = list_entry(block_group->cluster_list.next,
1083				     struct btrfs_free_cluster,
1084				     block_group_list);
1085	}
1086
1087	if (!node && cluster) {
1088		cluster_locked = cluster;
1089		spin_lock(&cluster_locked->lock);
1090		node = rb_first(&cluster->root);
1091		cluster = NULL;
1092	}
1093
1094	/* Write out the extent entries */
1095	while (node) {
1096		struct btrfs_free_space *e;
1097
1098		e = rb_entry(node, struct btrfs_free_space, offset_index);
1099		*entries += 1;
1100
1101		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1102				       e->bitmap);
1103		if (ret)
1104			goto fail;
1105
1106		if (e->bitmap) {
1107			list_add_tail(&e->list, bitmap_list);
1108			*bitmaps += 1;
1109		}
1110		node = rb_next(node);
1111		if (!node && cluster) {
1112			node = rb_first(&cluster->root);
1113			cluster_locked = cluster;
1114			spin_lock(&cluster_locked->lock);
1115			cluster = NULL;
1116		}
1117	}
1118	if (cluster_locked) {
1119		spin_unlock(&cluster_locked->lock);
1120		cluster_locked = NULL;
1121	}
1122
1123	/*
1124	 * Make sure we don't miss any range that was removed from our rbtree
1125	 * because trimming is running. Otherwise after a umount+mount (or crash
1126	 * after committing the transaction) we would leak free space and get
1127	 * an inconsistent free space cache report from fsck.
1128	 */
1129	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1130		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1131				       trim_entry->bytes, NULL);
1132		if (ret)
1133			goto fail;
1134		*entries += 1;
1135	}
1136
1137	return 0;
1138fail:
1139	if (cluster_locked)
1140		spin_unlock(&cluster_locked->lock);
1141	return -ENOSPC;
1142}
1143
1144static noinline_for_stack int
1145update_cache_item(struct btrfs_trans_handle *trans,
1146		  struct btrfs_root *root,
1147		  struct inode *inode,
1148		  struct btrfs_path *path, u64 offset,
1149		  int entries, int bitmaps)
1150{
1151	struct btrfs_key key;
1152	struct btrfs_free_space_header *header;
1153	struct extent_buffer *leaf;
1154	int ret;
1155
1156	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1157	key.offset = offset;
1158	key.type = 0;
1159
1160	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1161	if (ret < 0) {
1162		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1163				 EXTENT_DELALLOC, NULL);
1164		goto fail;
1165	}
1166	leaf = path->nodes[0];
1167	if (ret > 0) {
1168		struct btrfs_key found_key;
1169		ASSERT(path->slots[0]);
1170		path->slots[0]--;
1171		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1172		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1173		    found_key.offset != offset) {
1174			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1175					 inode->i_size - 1, EXTENT_DELALLOC,
1176					 NULL);
1177			btrfs_release_path(path);
1178			goto fail;
1179		}
1180	}
1181
1182	BTRFS_I(inode)->generation = trans->transid;
1183	header = btrfs_item_ptr(leaf, path->slots[0],
1184				struct btrfs_free_space_header);
1185	btrfs_set_free_space_entries(leaf, header, entries);
1186	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1187	btrfs_set_free_space_generation(leaf, header, trans->transid);
1188	btrfs_mark_buffer_dirty(trans, leaf);
1189	btrfs_release_path(path);
1190
1191	return 0;
1192
1193fail:
1194	return -1;
1195}
1196
1197static noinline_for_stack int write_pinned_extent_entries(
1198			    struct btrfs_trans_handle *trans,
1199			    struct btrfs_block_group *block_group,
1200			    struct btrfs_io_ctl *io_ctl,
1201			    int *entries)
1202{
1203	u64 start, extent_start, extent_end, len;
1204	struct extent_io_tree *unpin = NULL;
1205	int ret;
1206
1207	if (!block_group)
1208		return 0;
1209
1210	/*
1211	 * We want to add any pinned extents to our free space cache
1212	 * so we don't leak the space
1213	 *
1214	 * We shouldn't have switched the pinned extents yet so this is the
1215	 * right one
1216	 */
1217	unpin = &trans->transaction->pinned_extents;
1218
1219	start = block_group->start;
1220
1221	while (start < block_group->start + block_group->length) {
1222		if (!find_first_extent_bit(unpin, start,
1223					   &extent_start, &extent_end,
1224					   EXTENT_DIRTY, NULL))
1225			return 0;
1226
1227		/* This pinned extent is out of our range */
1228		if (extent_start >= block_group->start + block_group->length)
1229			return 0;
1230
1231		extent_start = max(extent_start, start);
1232		extent_end = min(block_group->start + block_group->length,
1233				 extent_end + 1);
1234		len = extent_end - extent_start;
1235
1236		*entries += 1;
1237		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1238		if (ret)
1239			return -ENOSPC;
1240
1241		start = extent_end;
1242	}
1243
1244	return 0;
1245}
1246
1247static noinline_for_stack int
1248write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1249{
1250	struct btrfs_free_space *entry, *next;
1251	int ret;
1252
1253	/* Write out the bitmaps */
1254	list_for_each_entry_safe(entry, next, bitmap_list, list) {
1255		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1256		if (ret)
1257			return -ENOSPC;
1258		list_del_init(&entry->list);
1259	}
1260
1261	return 0;
1262}
1263
1264static int flush_dirty_cache(struct inode *inode)
1265{
1266	int ret;
1267
1268	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1269	if (ret)
1270		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1271				 EXTENT_DELALLOC, NULL);
1272
1273	return ret;
1274}
1275
1276static void noinline_for_stack
1277cleanup_bitmap_list(struct list_head *bitmap_list)
1278{
1279	struct btrfs_free_space *entry, *next;
1280
1281	list_for_each_entry_safe(entry, next, bitmap_list, list)
1282		list_del_init(&entry->list);
1283}
1284
1285static void noinline_for_stack
1286cleanup_write_cache_enospc(struct inode *inode,
1287			   struct btrfs_io_ctl *io_ctl,
1288			   struct extent_state **cached_state)
1289{
1290	io_ctl_drop_pages(io_ctl);
1291	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1292		      cached_state);
1293}
1294
1295static int __btrfs_wait_cache_io(struct btrfs_root *root,
1296				 struct btrfs_trans_handle *trans,
1297				 struct btrfs_block_group *block_group,
1298				 struct btrfs_io_ctl *io_ctl,
1299				 struct btrfs_path *path, u64 offset)
1300{
1301	int ret;
1302	struct inode *inode = io_ctl->inode;
1303
1304	if (!inode)
1305		return 0;
1306
1307	/* Flush the dirty pages in the cache file. */
1308	ret = flush_dirty_cache(inode);
1309	if (ret)
1310		goto out;
1311
1312	/* Update the cache item to tell everyone this cache file is valid. */
1313	ret = update_cache_item(trans, root, inode, path, offset,
1314				io_ctl->entries, io_ctl->bitmaps);
1315out:
1316	if (ret) {
1317		invalidate_inode_pages2(inode->i_mapping);
1318		BTRFS_I(inode)->generation = 0;
1319		if (block_group)
1320			btrfs_debug(root->fs_info,
1321	  "failed to write free space cache for block group %llu error %d",
1322				  block_group->start, ret);
1323	}
1324	btrfs_update_inode(trans, root, BTRFS_I(inode));
1325
1326	if (block_group) {
1327		/* the dirty list is protected by the dirty_bgs_lock */
1328		spin_lock(&trans->transaction->dirty_bgs_lock);
1329
1330		/* the disk_cache_state is protected by the block group lock */
1331		spin_lock(&block_group->lock);
1332
1333		/*
1334		 * only mark this as written if we didn't get put back on
1335		 * the dirty list while waiting for IO.   Otherwise our
1336		 * cache state won't be right, and we won't get written again
1337		 */
1338		if (!ret && list_empty(&block_group->dirty_list))
1339			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1340		else if (ret)
1341			block_group->disk_cache_state = BTRFS_DC_ERROR;
1342
1343		spin_unlock(&block_group->lock);
1344		spin_unlock(&trans->transaction->dirty_bgs_lock);
1345		io_ctl->inode = NULL;
1346		iput(inode);
1347	}
1348
1349	return ret;
1350
1351}
1352
1353int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1354			struct btrfs_block_group *block_group,
1355			struct btrfs_path *path)
1356{
1357	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1358				     block_group, &block_group->io_ctl,
1359				     path, block_group->start);
1360}
1361
1362/*
1363 * Write out cached info to an inode.
1364 *
1365 * @root:        root the inode belongs to
1366 * @inode:       freespace inode we are writing out
1367 * @ctl:         free space cache we are going to write out
1368 * @block_group: block_group for this cache if it belongs to a block_group
1369 * @io_ctl:      holds context for the io
1370 * @trans:       the trans handle
1371 *
1372 * This function writes out a free space cache struct to disk for quick recovery
1373 * on mount.  This will return 0 if it was successful in writing the cache out,
1374 * or an errno if it was not.
1375 */
1376static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1377				   struct btrfs_free_space_ctl *ctl,
1378				   struct btrfs_block_group *block_group,
1379				   struct btrfs_io_ctl *io_ctl,
1380				   struct btrfs_trans_handle *trans)
1381{
1382	struct extent_state *cached_state = NULL;
1383	LIST_HEAD(bitmap_list);
1384	int entries = 0;
1385	int bitmaps = 0;
1386	int ret;
1387	int must_iput = 0;
1388
1389	if (!i_size_read(inode))
1390		return -EIO;
1391
1392	WARN_ON(io_ctl->pages);
1393	ret = io_ctl_init(io_ctl, inode, 1);
1394	if (ret)
1395		return ret;
1396
1397	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1398		down_write(&block_group->data_rwsem);
1399		spin_lock(&block_group->lock);
1400		if (block_group->delalloc_bytes) {
1401			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1402			spin_unlock(&block_group->lock);
1403			up_write(&block_group->data_rwsem);
1404			BTRFS_I(inode)->generation = 0;
1405			ret = 0;
1406			must_iput = 1;
1407			goto out;
1408		}
1409		spin_unlock(&block_group->lock);
1410	}
1411
1412	/* Lock all pages first so we can lock the extent safely. */
1413	ret = io_ctl_prepare_pages(io_ctl, false);
1414	if (ret)
1415		goto out_unlock;
1416
1417	lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1418		    &cached_state);
1419
1420	io_ctl_set_generation(io_ctl, trans->transid);
1421
1422	mutex_lock(&ctl->cache_writeout_mutex);
1423	/* Write out the extent entries in the free space cache */
1424	spin_lock(&ctl->tree_lock);
1425	ret = write_cache_extent_entries(io_ctl, ctl,
1426					 block_group, &entries, &bitmaps,
1427					 &bitmap_list);
1428	if (ret)
1429		goto out_nospc_locked;
1430
1431	/*
1432	 * Some spaces that are freed in the current transaction are pinned,
1433	 * they will be added into free space cache after the transaction is
1434	 * committed, we shouldn't lose them.
1435	 *
1436	 * If this changes while we are working we'll get added back to
1437	 * the dirty list and redo it.  No locking needed
1438	 */
1439	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1440	if (ret)
1441		goto out_nospc_locked;
1442
1443	/*
1444	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1445	 * locked while doing it because a concurrent trim can be manipulating
1446	 * or freeing the bitmap.
1447	 */
1448	ret = write_bitmap_entries(io_ctl, &bitmap_list);
1449	spin_unlock(&ctl->tree_lock);
1450	mutex_unlock(&ctl->cache_writeout_mutex);
1451	if (ret)
1452		goto out_nospc;
1453
1454	/* Zero out the rest of the pages just to make sure */
1455	io_ctl_zero_remaining_pages(io_ctl);
1456
1457	/* Everything is written out, now we dirty the pages in the file. */
1458	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1459				io_ctl->num_pages, 0, i_size_read(inode),
1460				&cached_state, false);
1461	if (ret)
1462		goto out_nospc;
1463
1464	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1465		up_write(&block_group->data_rwsem);
1466	/*
1467	 * Release the pages and unlock the extent, we will flush
1468	 * them out later
1469	 */
1470	io_ctl_drop_pages(io_ctl);
1471	io_ctl_free(io_ctl);
1472
1473	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1474		      &cached_state);
1475
1476	/*
1477	 * at this point the pages are under IO and we're happy,
1478	 * The caller is responsible for waiting on them and updating
1479	 * the cache and the inode
1480	 */
1481	io_ctl->entries = entries;
1482	io_ctl->bitmaps = bitmaps;
1483
1484	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1485	if (ret)
1486		goto out;
1487
1488	return 0;
1489
1490out_nospc_locked:
1491	cleanup_bitmap_list(&bitmap_list);
1492	spin_unlock(&ctl->tree_lock);
1493	mutex_unlock(&ctl->cache_writeout_mutex);
1494
1495out_nospc:
1496	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1497
1498out_unlock:
1499	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1500		up_write(&block_group->data_rwsem);
1501
1502out:
1503	io_ctl->inode = NULL;
1504	io_ctl_free(io_ctl);
1505	if (ret) {
1506		invalidate_inode_pages2(inode->i_mapping);
1507		BTRFS_I(inode)->generation = 0;
1508	}
1509	btrfs_update_inode(trans, root, BTRFS_I(inode));
1510	if (must_iput)
1511		iput(inode);
1512	return ret;
1513}
1514
1515int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1516			  struct btrfs_block_group *block_group,
1517			  struct btrfs_path *path)
1518{
1519	struct btrfs_fs_info *fs_info = trans->fs_info;
1520	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1521	struct inode *inode;
1522	int ret = 0;
1523
1524	spin_lock(&block_group->lock);
1525	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1526		spin_unlock(&block_group->lock);
1527		return 0;
1528	}
1529	spin_unlock(&block_group->lock);
1530
1531	inode = lookup_free_space_inode(block_group, path);
1532	if (IS_ERR(inode))
1533		return 0;
1534
1535	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1536				block_group, &block_group->io_ctl, trans);
1537	if (ret) {
1538		btrfs_debug(fs_info,
1539	  "failed to write free space cache for block group %llu error %d",
1540			  block_group->start, ret);
1541		spin_lock(&block_group->lock);
1542		block_group->disk_cache_state = BTRFS_DC_ERROR;
1543		spin_unlock(&block_group->lock);
1544
1545		block_group->io_ctl.inode = NULL;
1546		iput(inode);
1547	}
1548
1549	/*
1550	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1551	 * to wait for IO and put the inode
1552	 */
1553
1554	return ret;
1555}
1556
1557static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1558					  u64 offset)
1559{
1560	ASSERT(offset >= bitmap_start);
1561	offset -= bitmap_start;
1562	return (unsigned long)(div_u64(offset, unit));
1563}
1564
1565static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1566{
1567	return (unsigned long)(div_u64(bytes, unit));
1568}
1569
1570static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1571				   u64 offset)
1572{
1573	u64 bitmap_start;
1574	u64 bytes_per_bitmap;
1575
1576	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1577	bitmap_start = offset - ctl->start;
1578	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1579	bitmap_start *= bytes_per_bitmap;
1580	bitmap_start += ctl->start;
1581
1582	return bitmap_start;
1583}
1584
1585static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
1586			      struct btrfs_free_cluster *cluster,
1587			      struct btrfs_free_space *new_entry)
1588{
1589	struct rb_root *root;
1590	struct rb_node **p;
1591	struct rb_node *parent = NULL;
1592
1593	lockdep_assert_held(&ctl->tree_lock);
1594
1595	if (cluster) {
1596		lockdep_assert_held(&cluster->lock);
1597		root = &cluster->root;
1598	} else {
1599		root = &ctl->free_space_offset;
1600	}
1601
1602	p = &root->rb_node;
1603
1604	while (*p) {
1605		struct btrfs_free_space *info;
1606
1607		parent = *p;
1608		info = rb_entry(parent, struct btrfs_free_space, offset_index);
1609
1610		if (new_entry->offset < info->offset) {
1611			p = &(*p)->rb_left;
1612		} else if (new_entry->offset > info->offset) {
1613			p = &(*p)->rb_right;
1614		} else {
1615			/*
1616			 * we could have a bitmap entry and an extent entry
1617			 * share the same offset.  If this is the case, we want
1618			 * the extent entry to always be found first if we do a
1619			 * linear search through the tree, since we want to have
1620			 * the quickest allocation time, and allocating from an
1621			 * extent is faster than allocating from a bitmap.  So
1622			 * if we're inserting a bitmap and we find an entry at
1623			 * this offset, we want to go right, or after this entry
1624			 * logically.  If we are inserting an extent and we've
1625			 * found a bitmap, we want to go left, or before
1626			 * logically.
1627			 */
1628			if (new_entry->bitmap) {
1629				if (info->bitmap) {
1630					WARN_ON_ONCE(1);
1631					return -EEXIST;
1632				}
1633				p = &(*p)->rb_right;
1634			} else {
1635				if (!info->bitmap) {
1636					WARN_ON_ONCE(1);
1637					return -EEXIST;
1638				}
1639				p = &(*p)->rb_left;
1640			}
1641		}
1642	}
1643
1644	rb_link_node(&new_entry->offset_index, parent, p);
1645	rb_insert_color(&new_entry->offset_index, root);
1646
1647	return 0;
1648}
1649
1650/*
1651 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
1652 * searched through the bitmap and figured out the largest ->max_extent_size,
1653 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
1654 * allocator the wrong thing, we want to use the actual real max_extent_size
1655 * we've found already if it's larger, or we want to use ->bytes.
1656 *
1657 * This matters because find_free_space() will skip entries who's ->bytes is
1658 * less than the required bytes.  So if we didn't search down this bitmap, we
1659 * may pick some previous entry that has a smaller ->max_extent_size than we
1660 * have.  For example, assume we have two entries, one that has
1661 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
1662 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
1663 *  call into find_free_space(), and return with max_extent_size == 4K, because
1664 *  that first bitmap entry had ->max_extent_size set, but the second one did
1665 *  not.  If instead we returned 8K we'd come in searching for 8K, and find the
1666 *  8K contiguous range.
1667 *
1668 *  Consider the other case, we have 2 8K chunks in that second entry and still
1669 *  don't have ->max_extent_size set.  We'll return 16K, and the next time the
1670 *  allocator comes in it'll fully search our second bitmap, and this time it'll
1671 *  get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
1672 *  right allocation the next loop through.
1673 */
1674static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
1675{
1676	if (entry->bitmap && entry->max_extent_size)
1677		return entry->max_extent_size;
1678	return entry->bytes;
1679}
1680
1681/*
1682 * We want the largest entry to be leftmost, so this is inverted from what you'd
1683 * normally expect.
1684 */
1685static bool entry_less(struct rb_node *node, const struct rb_node *parent)
1686{
1687	const struct btrfs_free_space *entry, *exist;
1688
1689	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
1690	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
1691	return get_max_extent_size(exist) < get_max_extent_size(entry);
1692}
1693
1694/*
1695 * searches the tree for the given offset.
1696 *
1697 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1698 * want a section that has at least bytes size and comes at or after the given
1699 * offset.
1700 */
1701static struct btrfs_free_space *
1702tree_search_offset(struct btrfs_free_space_ctl *ctl,
1703		   u64 offset, int bitmap_only, int fuzzy)
1704{
1705	struct rb_node *n = ctl->free_space_offset.rb_node;
1706	struct btrfs_free_space *entry = NULL, *prev = NULL;
1707
1708	lockdep_assert_held(&ctl->tree_lock);
1709
1710	/* find entry that is closest to the 'offset' */
1711	while (n) {
1712		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1713		prev = entry;
1714
1715		if (offset < entry->offset)
1716			n = n->rb_left;
1717		else if (offset > entry->offset)
1718			n = n->rb_right;
1719		else
1720			break;
1721
1722		entry = NULL;
1723	}
1724
1725	if (bitmap_only) {
1726		if (!entry)
1727			return NULL;
1728		if (entry->bitmap)
1729			return entry;
1730
1731		/*
1732		 * bitmap entry and extent entry may share same offset,
1733		 * in that case, bitmap entry comes after extent entry.
1734		 */
1735		n = rb_next(n);
1736		if (!n)
1737			return NULL;
1738		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1739		if (entry->offset != offset)
1740			return NULL;
1741
1742		WARN_ON(!entry->bitmap);
1743		return entry;
1744	} else if (entry) {
1745		if (entry->bitmap) {
1746			/*
1747			 * if previous extent entry covers the offset,
1748			 * we should return it instead of the bitmap entry
1749			 */
1750			n = rb_prev(&entry->offset_index);
1751			if (n) {
1752				prev = rb_entry(n, struct btrfs_free_space,
1753						offset_index);
1754				if (!prev->bitmap &&
1755				    prev->offset + prev->bytes > offset)
1756					entry = prev;
1757			}
1758		}
1759		return entry;
1760	}
1761
1762	if (!prev)
1763		return NULL;
1764
1765	/* find last entry before the 'offset' */
1766	entry = prev;
1767	if (entry->offset > offset) {
1768		n = rb_prev(&entry->offset_index);
1769		if (n) {
1770			entry = rb_entry(n, struct btrfs_free_space,
1771					offset_index);
1772			ASSERT(entry->offset <= offset);
1773		} else {
1774			if (fuzzy)
1775				return entry;
1776			else
1777				return NULL;
1778		}
1779	}
1780
1781	if (entry->bitmap) {
1782		n = rb_prev(&entry->offset_index);
1783		if (n) {
1784			prev = rb_entry(n, struct btrfs_free_space,
1785					offset_index);
1786			if (!prev->bitmap &&
1787			    prev->offset + prev->bytes > offset)
1788				return prev;
1789		}
1790		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1791			return entry;
1792	} else if (entry->offset + entry->bytes > offset)
1793		return entry;
1794
1795	if (!fuzzy)
1796		return NULL;
1797
1798	while (1) {
1799		n = rb_next(&entry->offset_index);
1800		if (!n)
1801			return NULL;
1802		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1803		if (entry->bitmap) {
1804			if (entry->offset + BITS_PER_BITMAP *
1805			    ctl->unit > offset)
1806				break;
1807		} else {
1808			if (entry->offset + entry->bytes > offset)
1809				break;
1810		}
1811	}
1812	return entry;
1813}
1814
1815static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1816				     struct btrfs_free_space *info,
1817				     bool update_stat)
1818{
1819	lockdep_assert_held(&ctl->tree_lock);
1820
1821	rb_erase(&info->offset_index, &ctl->free_space_offset);
1822	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1823	ctl->free_extents--;
1824
1825	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1826		ctl->discardable_extents[BTRFS_STAT_CURR]--;
1827		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1828	}
1829
1830	if (update_stat)
1831		ctl->free_space -= info->bytes;
1832}
1833
1834static int link_free_space(struct btrfs_free_space_ctl *ctl,
1835			   struct btrfs_free_space *info)
1836{
1837	int ret = 0;
1838
1839	lockdep_assert_held(&ctl->tree_lock);
1840
1841	ASSERT(info->bytes || info->bitmap);
1842	ret = tree_insert_offset(ctl, NULL, info);
1843	if (ret)
1844		return ret;
1845
1846	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1847
1848	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1849		ctl->discardable_extents[BTRFS_STAT_CURR]++;
1850		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1851	}
1852
1853	ctl->free_space += info->bytes;
1854	ctl->free_extents++;
1855	return ret;
1856}
1857
1858static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1859				struct btrfs_free_space *info)
1860{
1861	ASSERT(info->bitmap);
1862
1863	/*
1864	 * If our entry is empty it's because we're on a cluster and we don't
1865	 * want to re-link it into our ctl bytes index.
1866	 */
1867	if (RB_EMPTY_NODE(&info->bytes_index))
1868		return;
1869
1870	lockdep_assert_held(&ctl->tree_lock);
1871
1872	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1873	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1874}
1875
1876static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1877				     struct btrfs_free_space *info,
1878				     u64 offset, u64 bytes, bool update_stat)
1879{
1880	unsigned long start, count, end;
1881	int extent_delta = -1;
1882
1883	start = offset_to_bit(info->offset, ctl->unit, offset);
1884	count = bytes_to_bits(bytes, ctl->unit);
1885	end = start + count;
1886	ASSERT(end <= BITS_PER_BITMAP);
1887
1888	bitmap_clear(info->bitmap, start, count);
1889
1890	info->bytes -= bytes;
1891	if (info->max_extent_size > ctl->unit)
1892		info->max_extent_size = 0;
1893
1894	relink_bitmap_entry(ctl, info);
1895
1896	if (start && test_bit(start - 1, info->bitmap))
1897		extent_delta++;
1898
1899	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1900		extent_delta++;
1901
1902	info->bitmap_extents += extent_delta;
1903	if (!btrfs_free_space_trimmed(info)) {
1904		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1905		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1906	}
1907
1908	if (update_stat)
1909		ctl->free_space -= bytes;
1910}
1911
1912static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1913			    struct btrfs_free_space *info, u64 offset,
1914			    u64 bytes)
1915{
1916	unsigned long start, count, end;
1917	int extent_delta = 1;
1918
1919	start = offset_to_bit(info->offset, ctl->unit, offset);
1920	count = bytes_to_bits(bytes, ctl->unit);
1921	end = start + count;
1922	ASSERT(end <= BITS_PER_BITMAP);
1923
1924	bitmap_set(info->bitmap, start, count);
1925
1926	/*
1927	 * We set some bytes, we have no idea what the max extent size is
1928	 * anymore.
1929	 */
1930	info->max_extent_size = 0;
1931	info->bytes += bytes;
1932	ctl->free_space += bytes;
1933
1934	relink_bitmap_entry(ctl, info);
1935
1936	if (start && test_bit(start - 1, info->bitmap))
1937		extent_delta--;
1938
1939	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1940		extent_delta--;
1941
1942	info->bitmap_extents += extent_delta;
1943	if (!btrfs_free_space_trimmed(info)) {
1944		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1945		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1946	}
1947}
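
/*
 * Illustrative example (numbers are assumptions, not taken from the code
 * above): with 4 KiB pages and a 4 KiB sectorsize, BITS_PER_BITMAP is 32768
 * and ctl->unit is 4096, so one bitmap entry covers 32768 * 4096 = 128 MiB of
 * the block group.  Setting a 64 KiB range then flips 16 bits:
 *
 *	start = offset_to_bit(info->offset, ctl->unit, offset)
 *	      = (offset - info->offset) / 4096
 *	count = bytes_to_bits(SZ_64K, ctl->unit) = 16
 *
 * The extent_delta bookkeeping only looks at the two neighbouring bits: if
 * the bit just before 'start' (or just after 'end') is already set, the new
 * run joins an existing run and contributes one fewer free extent.
 */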
1948
1949/*
1950 * If we cannot find a suitable extent, we will use bytes to record
1951 * the size of the max extent.
1952 */
1953static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1954			 struct btrfs_free_space *bitmap_info, u64 *offset,
1955			 u64 *bytes, bool for_alloc)
1956{
1957	unsigned long found_bits = 0;
1958	unsigned long max_bits = 0;
1959	unsigned long bits, i;
1960	unsigned long next_zero;
1961	unsigned long extent_bits;
1962
1963	/*
1964	 * Skip searching the bitmap if we don't have a contiguous section that
1965	 * is large enough for this allocation.
1966	 */
1967	if (for_alloc &&
1968	    bitmap_info->max_extent_size &&
1969	    bitmap_info->max_extent_size < *bytes) {
1970		*bytes = bitmap_info->max_extent_size;
1971		return -1;
1972	}
1973
1974	i = offset_to_bit(bitmap_info->offset, ctl->unit,
1975			  max_t(u64, *offset, bitmap_info->offset));
1976	bits = bytes_to_bits(*bytes, ctl->unit);
1977
1978	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1979		if (for_alloc && bits == 1) {
1980			found_bits = 1;
1981			break;
1982		}
1983		next_zero = find_next_zero_bit(bitmap_info->bitmap,
1984					       BITS_PER_BITMAP, i);
1985		extent_bits = next_zero - i;
1986		if (extent_bits >= bits) {
1987			found_bits = extent_bits;
1988			break;
1989		} else if (extent_bits > max_bits) {
1990			max_bits = extent_bits;
1991		}
1992		i = next_zero;
1993	}
1994
1995	if (found_bits) {
1996		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1997		*bytes = (u64)(found_bits) * ctl->unit;
1998		return 0;
1999	}
2000
2001	*bytes = (u64)(max_bits) * ctl->unit;
2002	bitmap_info->max_extent_size = *bytes;
2003	relink_bitmap_entry(ctl, bitmap_info);
2004	return -1;
2005}
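
/*
 * Hypothetical usage sketch (the real callers are below); *offset and *bytes
 * are in/out: on success they describe the free run that was found, on
 * failure *bytes is overwritten with the largest free run seen (or the cached
 * max_extent_size) so the caller can report it:
 *
 *	u64 offset = bitmap_info->offset;
 *	u64 bytes = SZ_1M;
 *
 *	if (search_bitmap(ctl, bitmap_info, &offset, &bytes, true) == 0)
 *		use [offset, offset + bytes), a free run of at least 1 MiB
 *	else
 *		bytes now holds the largest contiguous free run found
 */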
2006
2007/* Cache the size of the max extent in bytes */
2008static struct btrfs_free_space *
2009find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
2010		unsigned long align, u64 *max_extent_size, bool use_bytes_index)
2011{
2012	struct btrfs_free_space *entry;
2013	struct rb_node *node;
2014	u64 tmp;
2015	u64 align_off;
2016	int ret;
2017
2018	if (!ctl->free_space_offset.rb_node)
2019		goto out;
2020again:
2021	if (use_bytes_index) {
2022		node = rb_first_cached(&ctl->free_space_bytes);
2023	} else {
2024		entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
2025					   0, 1);
2026		if (!entry)
2027			goto out;
2028		node = &entry->offset_index;
2029	}
2030
2031	for (; node; node = rb_next(node)) {
2032		if (use_bytes_index)
2033			entry = rb_entry(node, struct btrfs_free_space,
2034					 bytes_index);
2035		else
2036			entry = rb_entry(node, struct btrfs_free_space,
2037					 offset_index);
2038
2039		/*
2040		 * If we are using the bytes index then all subsequent entries
2041		 * in this tree are going to be < bytes, so simply set the max
2042		 * extent size and exit the loop.
2043		 *
2044		 * If we're using the offset index then we need to keep going
2045		 * through the rest of the tree.
2046		 */
2047		if (entry->bytes < *bytes) {
2048			*max_extent_size = max(get_max_extent_size(entry),
2049					       *max_extent_size);
2050			if (use_bytes_index)
2051				break;
2052			continue;
2053		}
2054
2055		/* Make sure the space returned is big enough to match our
2056		 * requested alignment.
2057		 */
2058		if (*bytes >= align) {
2059			tmp = entry->offset - ctl->start + align - 1;
2060			tmp = div64_u64(tmp, align);
2061			tmp = tmp * align + ctl->start;
2062			align_off = tmp - entry->offset;
2063		} else {
2064			align_off = 0;
2065			tmp = entry->offset;
2066		}
2067
2068		/*
2069		 * We don't break here if we're using the bytes index because we
2070		 * may have another entry that has the correct alignment that is
2071		 * the right size, so we don't want to miss that possibility.
2072		 * At worst this adds another loop through the logic, but if we
2073		 * broke here we could prematurely ENOSPC.
2074		 */
2075		if (entry->bytes < *bytes + align_off) {
2076			*max_extent_size = max(get_max_extent_size(entry),
2077					       *max_extent_size);
2078			continue;
2079		}
2080
2081		if (entry->bitmap) {
2082			struct rb_node *old_next = rb_next(node);
2083			u64 size = *bytes;
2084
2085			ret = search_bitmap(ctl, entry, &tmp, &size, true);
2086			if (!ret) {
2087				*offset = tmp;
2088				*bytes = size;
2089				return entry;
2090			} else {
2091				*max_extent_size =
2092					max(get_max_extent_size(entry),
2093					    *max_extent_size);
2094			}
2095
2096			/*
2097			 * The bitmap may have gotten re-arranged in the bytes
2098			 * index here because the max_extent_size may have been
2099			 * updated.  Start from the beginning again if this
2100			 * happened.
2101			 */
2102			if (use_bytes_index && old_next != rb_next(node))
2103				goto again;
2104			continue;
2105		}
2106
2107		*offset = tmp;
2108		*bytes = entry->bytes - align_off;
2109		return entry;
2110	}
2111out:
2112	return NULL;
2113}
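
/*
 * Worked example of the alignment math above, with made-up numbers: suppose
 * ctl->start = 1 GiB, align = 64 KiB and the entry starts at 1 GiB + 100K.
 * Then:
 *
 *	tmp = 100K + 64K - 1       = 164K - 1
 *	tmp = (164K - 1) / 64K     = 2
 *	tmp = 2 * 64K + 1 GiB      = 1 GiB + 128K
 *	align_off = 128K - 100K    = 28K
 *
 * so the returned offset is the first 64 KiB aligned address (relative to
 * ctl->start) inside the entry, and the entry must hold at least
 * *bytes + 28K to satisfy the request.
 */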
2114
2115static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
2116			   struct btrfs_free_space *info, u64 offset)
2117{
2118	info->offset = offset_to_bitmap(ctl, offset);
2119	info->bytes = 0;
2120	info->bitmap_extents = 0;
2121	INIT_LIST_HEAD(&info->list);
2122	link_free_space(ctl, info);
2123	ctl->total_bitmaps++;
2124	recalculate_thresholds(ctl);
2125}
2126
2127static void free_bitmap(struct btrfs_free_space_ctl *ctl,
2128			struct btrfs_free_space *bitmap_info)
2129{
2130	/*
2131	 * Normally when this is called, the bitmap is completely empty. However,
2132	 * if we are blowing up the free space cache for one reason or another
2133	 * via __btrfs_remove_free_space_cache(), then the bitmap may not be
2134	 * empty and we would leave stale discardable stats behind.
2135	 */
2136	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2137		ctl->discardable_extents[BTRFS_STAT_CURR] -=
2138			bitmap_info->bitmap_extents;
2139		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2140
2141	}
2142	unlink_free_space(ctl, bitmap_info, true);
2143	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
2144	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
2145	ctl->total_bitmaps--;
2146	recalculate_thresholds(ctl);
2147}
2148
2149static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
2150			      struct btrfs_free_space *bitmap_info,
2151			      u64 *offset, u64 *bytes)
2152{
2153	u64 end;
2154	u64 search_start, search_bytes;
2155	int ret;
2156
2157again:
2158	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
2159
2160	/*
2161	 * We need to search for bits in this bitmap.  We could only cover some
2162	 * of the extent in this bitmap thanks to how we add space, so we need
2163	 * to search for as much of it as we can and clear that amount, and then
2164	 * go searching for the next bit.
2165	 */
2166	search_start = *offset;
2167	search_bytes = ctl->unit;
2168	search_bytes = min(search_bytes, end - search_start + 1);
2169	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2170			    false);
2171	if (ret < 0 || search_start != *offset)
2172		return -EINVAL;
2173
2174	/* We may have found more bits than what we need */
2175	search_bytes = min(search_bytes, *bytes);
2176
2177	/* Cannot clear past the end of the bitmap */
2178	search_bytes = min(search_bytes, end - search_start + 1);
2179
2180	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
2181	*offset += search_bytes;
2182	*bytes -= search_bytes;
2183
2184	if (*bytes) {
2185		struct rb_node *next = rb_next(&bitmap_info->offset_index);
2186		if (!bitmap_info->bytes)
2187			free_bitmap(ctl, bitmap_info);
2188
2189		/*
2190		 * no entry after this bitmap, but we still have bytes to
2191		 * remove, so something has gone wrong.
2192		 */
2193		if (!next)
2194			return -EINVAL;
2195
2196		bitmap_info = rb_entry(next, struct btrfs_free_space,
2197				       offset_index);
2198
2199		/*
2200		 * If the next entry isn't a bitmap we need to return so the
2201		 * caller can remove the remaining range via the extent path.
2202		 */
2203		if (!bitmap_info->bitmap)
2204			return -EAGAIN;
2205
2206		/*
2207		 * Ok the next item is a bitmap, but it may not actually hold
2208		 * the rest of the range we are removing, so look for it there,
2209		 * and if we don't find it return -EAGAIN so we can try
2210		 * everything over again.
2211		 */
2212		search_start = *offset;
2213		search_bytes = ctl->unit;
2214		ret = search_bitmap(ctl, bitmap_info, &search_start,
2215				    &search_bytes, false);
2216		if (ret < 0 || search_start != *offset)
2217			return -EAGAIN;
2218
2219		goto again;
2220	} else if (!bitmap_info->bytes)
2221		free_bitmap(ctl, bitmap_info);
2222
2223	return 0;
2224}
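
/*
 * Example of why the "again" loop above is needed (illustrative layout,
 * assuming a 4 KiB sectorsize so one bitmap covers 128 MiB): a removal range
 * that crosses a bitmap boundary can only be cleared up to the end of the
 * first bitmap in one pass, so *offset and *bytes are advanced and we move to
 * the next entry.  If that entry is another bitmap holding the rest we loop
 * again; if it is an extent entry we return -EAGAIN so
 * btrfs_remove_free_space() can retry via the extent removal path.
 */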
2225
2226static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2227			       struct btrfs_free_space *info, u64 offset,
2228			       u64 bytes, enum btrfs_trim_state trim_state)
2229{
2230	u64 bytes_to_set = 0;
2231	u64 end;
2232
2233	/*
2234	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
2235	 * whole bitmap untrimmed if at any point we add untrimmed regions.
2236	 */
2237	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2238		if (btrfs_free_space_trimmed(info)) {
2239			ctl->discardable_extents[BTRFS_STAT_CURR] +=
2240				info->bitmap_extents;
2241			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2242		}
2243		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2244	}
2245
2246	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2247
2248	bytes_to_set = min(end - offset, bytes);
2249
2250	bitmap_set_bits(ctl, info, offset, bytes_to_set);
2251
2252	return bytes_to_set;
2253
2254}
2255
2256static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2257		      struct btrfs_free_space *info)
2258{
2259	struct btrfs_block_group *block_group = ctl->block_group;
2260	struct btrfs_fs_info *fs_info = block_group->fs_info;
2261	bool forced = false;
2262
2263#ifdef CONFIG_BTRFS_DEBUG
2264	if (btrfs_should_fragment_free_space(block_group))
2265		forced = true;
2266#endif
2267
2268	/* This is a way to reclaim large regions from the bitmaps. */
2269	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2270		return false;
2271
2272	/*
2273	 * If we are below the extents threshold then we can add this as an
2274	 * extent, and don't have to deal with the bitmap
2275	 */
2276	if (!forced && ctl->free_extents < ctl->extents_thresh) {
2277		/*
2278		 * If this block group has some small extents we don't want to
2279		 * use up all of our free slots in the cache with them, we want
2280		 * to reserve them for larger extents.  However, if we have
2281		 * plenty of cache left then go ahead and add them, no sense in
2282		 * adding the overhead of a bitmap if we don't have to.
2283		 */
2284		if (info->bytes <= fs_info->sectorsize * 8) {
2285			if (ctl->free_extents * 3 <= ctl->extents_thresh)
2286				return false;
2287		} else {
2288			return false;
2289		}
2290	}
2291
2292	/*
2293	 * The original block groups from mkfs can be really small, like 8
2294	 * megabytes, so don't bother with a bitmap for those entries.  However
2295	 * some block groups can be smaller than what a bitmap would cover but
2296	 * are still large enough that they could overflow the 32k memory limit,
2297	 * so still allow those block groups to have a bitmap
2298	 * entry.
2299	 */
2300	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2301		return false;
2302
2303	return true;
2304}
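
/*
 * Rough decision sketch with example numbers (assuming a 4 KiB sectorsize and
 * ignoring the debug fragmentation and small block group special cases):
 *
 *   - a 2 MiB free range is >= FORCE_EXTENT_THRESHOLD (1 MiB), so it always
 *     stays an extent entry;
 *   - a 16 KiB range (at most 8 sectors, i.e. 32 KiB) stays an extent entry
 *     while no more than about a third of the extent slots are in use;
 *   - other ranges below 1 MiB only avoid a bitmap while free_extents is
 *     below extents_thresh.
 */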
2305
2306static const struct btrfs_free_space_op free_space_op = {
2307	.use_bitmap		= use_bitmap,
2308};
2309
2310static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2311			      struct btrfs_free_space *info)
2312{
2313	struct btrfs_free_space *bitmap_info;
2314	struct btrfs_block_group *block_group = NULL;
2315	int added = 0;
2316	u64 bytes, offset, bytes_added;
2317	enum btrfs_trim_state trim_state;
2318	int ret;
2319
2320	bytes = info->bytes;
2321	offset = info->offset;
2322	trim_state = info->trim_state;
2323
2324	if (!ctl->op->use_bitmap(ctl, info))
2325		return 0;
2326
2327	if (ctl->op == &free_space_op)
2328		block_group = ctl->block_group;
2329again:
2330	/*
2331	 * Since we link bitmaps right into the cluster we need to see if we
2332	 * have a cluster here, and if so and it has our bitmap we need to add
2333	 * the free space to that bitmap.
2334	 */
2335	if (block_group && !list_empty(&block_group->cluster_list)) {
2336		struct btrfs_free_cluster *cluster;
2337		struct rb_node *node;
2338		struct btrfs_free_space *entry;
2339
2340		cluster = list_entry(block_group->cluster_list.next,
2341				     struct btrfs_free_cluster,
2342				     block_group_list);
2343		spin_lock(&cluster->lock);
2344		node = rb_first(&cluster->root);
2345		if (!node) {
2346			spin_unlock(&cluster->lock);
2347			goto no_cluster_bitmap;
2348		}
2349
2350		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2351		if (!entry->bitmap) {
2352			spin_unlock(&cluster->lock);
2353			goto no_cluster_bitmap;
2354		}
2355
2356		if (entry->offset == offset_to_bitmap(ctl, offset)) {
2357			bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2358							  bytes, trim_state);
2359			bytes -= bytes_added;
2360			offset += bytes_added;
2361		}
2362		spin_unlock(&cluster->lock);
2363		if (!bytes) {
2364			ret = 1;
2365			goto out;
2366		}
2367	}
2368
2369no_cluster_bitmap:
2370	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2371					 1, 0);
2372	if (!bitmap_info) {
2373		ASSERT(added == 0);
2374		goto new_bitmap;
2375	}
2376
2377	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2378					  trim_state);
2379	bytes -= bytes_added;
2380	offset += bytes_added;
2381	added = 0;
2382
2383	if (!bytes) {
2384		ret = 1;
2385		goto out;
2386	} else
2387		goto again;
2388
2389new_bitmap:
2390	if (info && info->bitmap) {
2391		add_new_bitmap(ctl, info, offset);
2392		added = 1;
2393		info = NULL;
2394		goto again;
2395	} else {
2396		spin_unlock(&ctl->tree_lock);
2397
2398		/* no pre-allocated info, allocate a new one */
2399		if (!info) {
2400			info = kmem_cache_zalloc(btrfs_free_space_cachep,
2401						 GFP_NOFS);
2402			if (!info) {
2403				spin_lock(&ctl->tree_lock);
2404				ret = -ENOMEM;
2405				goto out;
2406			}
2407		}
2408
2409		/* allocate the bitmap */
2410		info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2411						 GFP_NOFS);
2412		info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2413		spin_lock(&ctl->tree_lock);
2414		if (!info->bitmap) {
2415			ret = -ENOMEM;
2416			goto out;
2417		}
2418		goto again;
2419	}
2420
2421out:
2422	if (info) {
2423		if (info->bitmap)
2424			kmem_cache_free(btrfs_free_space_bitmap_cachep,
2425					info->bitmap);
2426		kmem_cache_free(btrfs_free_space_cachep, info);
2427	}
2428
2429	return ret;
2430}
2431
2432/*
2433 * Free space merging rules:
2434 *  1) Merge trimmed areas together
2435 *  2) Let untrimmed areas coalesce with trimmed areas
2436 *  3) Always pull neighboring regions from bitmaps
2437 *
2438 * The above rules are for when we merge free space based on btrfs_trim_state.
2439 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2440 * same reason: to promote larger extent regions which makes life easier for
2441 * find_free_extent().  Rule 2 enables coalescing based on the common path
2442 * being the return of free space from btrfs_finish_extent_commit().  So when
2443 * free space is trimmed, it will prevent aggregating a trimmed new region
2444 * with untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger
2445 * extents and provide find_free_extent() with the largest extents possible
2446 * hoping for the reuse path.
2447 */
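/*
 * Small example of the rules (hypothetical layout): with an existing trimmed
 * extent [0, 64K) and an untrimmed extent being added at [64K, 128K), rule 2
 * lets them merge into one untrimmed [0, 128K) entry.  Adding a trimmed
 * region at [64K, 128K) next to an untrimmed [0, 64K) does not merge, so the
 * untrimmed part stays visible to async discard.
 */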
2448static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2449			  struct btrfs_free_space *info, bool update_stat)
2450{
2451	struct btrfs_free_space *left_info = NULL;
2452	struct btrfs_free_space *right_info;
2453	bool merged = false;
2454	u64 offset = info->offset;
2455	u64 bytes = info->bytes;
2456	const bool is_trimmed = btrfs_free_space_trimmed(info);
2457	struct rb_node *right_prev = NULL;
2458
2459	/*
2460	 * First we want to see if there is free space adjacent to the range we
2461	 * are adding.  If there is, remove that entry and fold its space into
2462	 * the one we are adding so that it covers the entire range.
2463	 */
2464	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2465	if (right_info)
2466		right_prev = rb_prev(&right_info->offset_index);
2467
2468	if (right_prev)
2469		left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index);
2470	else if (!right_info)
2471		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2472
2473	/* See try_merge_free_space() comment. */
2474	if (right_info && !right_info->bitmap &&
2475	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2476		unlink_free_space(ctl, right_info, update_stat);
2477		info->bytes += right_info->bytes;
2478		kmem_cache_free(btrfs_free_space_cachep, right_info);
2479		merged = true;
2480	}
2481
2482	/* See try_merge_free_space() comment. */
2483	if (left_info && !left_info->bitmap &&
2484	    left_info->offset + left_info->bytes == offset &&
2485	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2486		unlink_free_space(ctl, left_info, update_stat);
2487		info->offset = left_info->offset;
2488		info->bytes += left_info->bytes;
2489		kmem_cache_free(btrfs_free_space_cachep, left_info);
2490		merged = true;
2491	}
2492
2493	return merged;
2494}
2495
2496static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2497				     struct btrfs_free_space *info,
2498				     bool update_stat)
2499{
2500	struct btrfs_free_space *bitmap;
2501	unsigned long i;
2502	unsigned long j;
2503	const u64 end = info->offset + info->bytes;
2504	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2505	u64 bytes;
2506
2507	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2508	if (!bitmap)
2509		return false;
2510
2511	i = offset_to_bit(bitmap->offset, ctl->unit, end);
2512	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2513	if (j == i)
2514		return false;
2515	bytes = (j - i) * ctl->unit;
2516	info->bytes += bytes;
2517
2518	/* See try_merge_free_space() comment. */
2519	if (!btrfs_free_space_trimmed(bitmap))
2520		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2521
2522	bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
2523
2524	if (!bitmap->bytes)
2525		free_bitmap(ctl, bitmap);
2526
2527	return true;
2528}
2529
2530static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2531				       struct btrfs_free_space *info,
2532				       bool update_stat)
2533{
2534	struct btrfs_free_space *bitmap;
2535	u64 bitmap_offset;
2536	unsigned long i;
2537	unsigned long j;
2538	unsigned long prev_j;
2539	u64 bytes;
2540
2541	bitmap_offset = offset_to_bitmap(ctl, info->offset);
2542	/* If we're on a boundary, try the previous logical bitmap. */
2543	if (bitmap_offset == info->offset) {
2544		if (info->offset == 0)
2545			return false;
2546		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2547	}
2548
2549	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2550	if (!bitmap)
2551		return false;
2552
2553	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2554	j = 0;
2555	prev_j = (unsigned long)-1;
2556	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2557		if (j > i)
2558			break;
2559		prev_j = j;
2560	}
2561	if (prev_j == i)
2562		return false;
2563
2564	if (prev_j == (unsigned long)-1)
2565		bytes = (i + 1) * ctl->unit;
2566	else
2567		bytes = (i - prev_j) * ctl->unit;
2568
2569	info->offset -= bytes;
2570	info->bytes += bytes;
2571
2572	/* See try_merge_free_space() comment. */
2573	if (!btrfs_free_space_trimmed(bitmap))
2574		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2575
2576	bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
2577
2578	if (!bitmap->bytes)
2579		free_bitmap(ctl, bitmap);
2580
2581	return true;
2582}
2583
2584/*
2585 * We prefer always to allocate from extent entries, both for clustered and
2586 * non-clustered allocation requests. So when attempting to add a new extent
2587 * entry, try to see if there's adjacent free space in bitmap entries, and if
2588 * there is, migrate that space from the bitmaps to the extent.
2589 * Like this we get better chances of satisfying space allocation requests
2590 * because we attempt to satisfy them based on a single cache entry, and never
2591 * on 2 or more entries - even if the entries represent a contiguous free space
2592 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2593 * ends).
2594 */
2595static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2596			      struct btrfs_free_space *info,
2597			      bool update_stat)
2598{
2599	/*
2600	 * Only work with disconnected entries, as we can change their offset,
2601	 * and they must be extent entries.
2602	 */
2603	ASSERT(!info->bitmap);
2604	ASSERT(RB_EMPTY_NODE(&info->offset_index));
2605
2606	if (ctl->total_bitmaps > 0) {
2607		bool stole_end;
2608		bool stole_front = false;
2609
2610		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2611		if (ctl->total_bitmaps > 0)
2612			stole_front = steal_from_bitmap_to_front(ctl, info,
2613								 update_stat);
2614
2615		if (stole_end || stole_front)
2616			try_merge_free_space(ctl, info, update_stat);
2617	}
2618}
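
/*
 * Illustrative example (made-up offsets, 4 KiB unit): suppose we are adding
 * the extent [X, X + 4K) and a neighbouring bitmap has bits set for
 * [X + 4K, X + 16K).  steal_from_bitmap_to_end() clears those three bits and
 * grows the new entry to [X, X + 16K), so a later 16K allocation can be
 * served from a single extent entry instead of an extent plus a bitmap.
 */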
2619
2620int __btrfs_add_free_space(struct btrfs_block_group *block_group,
2621			   u64 offset, u64 bytes,
2622			   enum btrfs_trim_state trim_state)
2623{
2624	struct btrfs_fs_info *fs_info = block_group->fs_info;
2625	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2626	struct btrfs_free_space *info;
2627	int ret = 0;
2628	u64 filter_bytes = bytes;
2629
2630	ASSERT(!btrfs_is_zoned(fs_info));
2631
2632	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2633	if (!info)
2634		return -ENOMEM;
2635
2636	info->offset = offset;
2637	info->bytes = bytes;
2638	info->trim_state = trim_state;
2639	RB_CLEAR_NODE(&info->offset_index);
2640	RB_CLEAR_NODE(&info->bytes_index);
2641
2642	spin_lock(&ctl->tree_lock);
2643
2644	if (try_merge_free_space(ctl, info, true))
2645		goto link;
2646
2647	/*
2648	 * There was no extent directly to the left or right of this new
2649	 * extent, so we know we're going to have to add a new extent entry.
2650	 * Before we do that, see if we need to drop this into a bitmap.
2651	 */
2652	ret = insert_into_bitmap(ctl, info);
2653	if (ret < 0) {
2654		goto out;
2655	} else if (ret) {
2656		ret = 0;
2657		goto out;
2658	}
2659link:
2660	/*
2661	 * Only steal free space from adjacent bitmaps if we're sure we're not
2662	 * going to add the new free space to existing bitmap entries - because
2663	 * that would mean unnecessary work that would be reverted. Therefore
2664	 * attempt to steal space from bitmaps if we're adding an extent entry.
2665	 */
2666	steal_from_bitmap(ctl, info, true);
2667
2668	filter_bytes = max(filter_bytes, info->bytes);
2669
2670	ret = link_free_space(ctl, info);
2671	if (ret)
2672		kmem_cache_free(btrfs_free_space_cachep, info);
2673out:
2674	btrfs_discard_update_discardable(block_group);
2675	spin_unlock(&ctl->tree_lock);
2676
2677	if (ret) {
2678		btrfs_crit(fs_info, "unable to add free space: %d", ret);
2679		ASSERT(ret != -EEXIST);
2680	}
2681
2682	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2683		btrfs_discard_check_filter(block_group, filter_bytes);
2684		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2685	}
2686
2687	return ret;
2688}
2689
2690static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2691					u64 bytenr, u64 size, bool used)
2692{
2693	struct btrfs_space_info *sinfo = block_group->space_info;
2694	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2695	u64 offset = bytenr - block_group->start;
2696	u64 to_free, to_unusable;
2697	int bg_reclaim_threshold = 0;
2698	bool initial = (size == block_group->length);
2699	u64 reclaimable_unusable;
2700
2701	WARN_ON(!initial && offset + size > block_group->zone_capacity);
2702
2703	if (!initial)
2704		bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
2705
2706	spin_lock(&ctl->tree_lock);
2707	if (!used)
2708		to_free = size;
2709	else if (initial)
2710		to_free = block_group->zone_capacity;
2711	else if (offset >= block_group->alloc_offset)
2712		to_free = size;
2713	else if (offset + size <= block_group->alloc_offset)
2714		to_free = 0;
2715	else
2716		to_free = offset + size - block_group->alloc_offset;
2717	to_unusable = size - to_free;
2718
2719	ctl->free_space += to_free;
2720	/*
2721	 * If the block group is read-only, we should account freed space into
2722	 * bytes_readonly.
2723	 */
2724	if (!block_group->ro)
2725		block_group->zone_unusable += to_unusable;
2726	spin_unlock(&ctl->tree_lock);
2727	if (!used) {
2728		spin_lock(&block_group->lock);
2729		block_group->alloc_offset -= size;
2730		spin_unlock(&block_group->lock);
2731	}
2732
2733	reclaimable_unusable = block_group->zone_unusable -
2734			       (block_group->length - block_group->zone_capacity);
2735	/* The whole region is now unusable. Mark it as unused and reclaim it. */
2736	if (block_group->zone_unusable == block_group->length) {
2737		btrfs_mark_bg_unused(block_group);
2738	} else if (bg_reclaim_threshold &&
2739		   reclaimable_unusable >=
2740		   mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
2741		btrfs_mark_bg_to_reclaim(block_group);
2742	}
2743
2744	return 0;
2745}
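
/*
 * Worked example for the to_free/to_unusable split above (illustrative
 * numbers): take a zoned block group with length 256 MiB, zone_capacity
 * 200 MiB and alloc_offset 100 MiB.  Freeing a used 20 MiB range that starts
 * 90 MiB into the block group straddles the allocation pointer, so
 * to_free = 90M + 20M - 100M = 10 MiB and to_unusable = 10 MiB: the part at
 * or beyond the write pointer can be reused, the part already written can
 * only be reclaimed by resetting the zone.
 */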
2746
2747int btrfs_add_free_space(struct btrfs_block_group *block_group,
2748			 u64 bytenr, u64 size)
2749{
2750	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2751
2752	if (btrfs_is_zoned(block_group->fs_info))
2753		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2754						    true);
2755
2756	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2757		trim_state = BTRFS_TRIM_STATE_TRIMMED;
2758
2759	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2760}
2761
2762int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2763				u64 bytenr, u64 size)
2764{
2765	if (btrfs_is_zoned(block_group->fs_info))
2766		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2767						    false);
2768
2769	return btrfs_add_free_space(block_group, bytenr, size);
2770}
2771
2772/*
2773 * This is a subtle distinction because when adding free space back in general,
2774 * we want it to be added as untrimmed for async discard. But in the case
2775 * where we add it on loading of a block group, we want to consider it trimmed.
2776 */
2777int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2778				       u64 bytenr, u64 size)
2779{
2780	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2781
2782	if (btrfs_is_zoned(block_group->fs_info))
2783		return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2784						    true);
2785
2786	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2787	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2788		trim_state = BTRFS_TRIM_STATE_TRIMMED;
2789
2790	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2791}
2792
2793int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2794			    u64 offset, u64 bytes)
2795{
2796	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2797	struct btrfs_free_space *info;
2798	int ret;
2799	bool re_search = false;
2800
2801	if (btrfs_is_zoned(block_group->fs_info)) {
2802		/*
2803		 * This can happen with conventional zones when replaying the log.
2804		 * Since the allocation info of the tree-log nodes is not recorded
2805		 * to the extent tree, calculate_alloc_pointer() failed to
2806		 * advance the allocation pointer past the last allocated tree-log
2807		 * node blocks.
2808		 *
2809		 * This function is called from
2810		 * btrfs_pin_extent_for_log_replay() when replaying the log.
2811		 * Advance the pointer so we do not overwrite the tree-log nodes.
2812		 */
2813		if (block_group->start + block_group->alloc_offset <
2814		    offset + bytes) {
2815			block_group->alloc_offset =
2816				offset + bytes - block_group->start;
2817		}
2818		return 0;
2819	}
2820
2821	spin_lock(&ctl->tree_lock);
2822
2823again:
2824	ret = 0;
2825	if (!bytes)
2826		goto out_lock;
2827
2828	info = tree_search_offset(ctl, offset, 0, 0);
2829	if (!info) {
2830		/*
2831		 * Oops, we didn't find an extent that matched the space we
2832		 * wanted to remove; look for a bitmap instead.
2833		 */
2834		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2835					  1, 0);
2836		if (!info) {
2837			/*
2838			 * If we found a partial bit of our free space in a
2839			 * bitmap but then couldn't find the other part this may
2840			 * be a problem, so WARN about it.
2841			 */
2842			WARN_ON(re_search);
2843			goto out_lock;
2844		}
2845	}
2846
2847	re_search = false;
2848	if (!info->bitmap) {
2849		unlink_free_space(ctl, info, true);
2850		if (offset == info->offset) {
2851			u64 to_free = min(bytes, info->bytes);
2852
2853			info->bytes -= to_free;
2854			info->offset += to_free;
2855			if (info->bytes) {
2856				ret = link_free_space(ctl, info);
2857				WARN_ON(ret);
2858			} else {
2859				kmem_cache_free(btrfs_free_space_cachep, info);
2860			}
2861
2862			offset += to_free;
2863			bytes -= to_free;
2864			goto again;
2865		} else {
2866			u64 old_end = info->bytes + info->offset;
2867
2868			info->bytes = offset - info->offset;
2869			ret = link_free_space(ctl, info);
2870			WARN_ON(ret);
2871			if (ret)
2872				goto out_lock;
2873
2874			/* Not enough bytes in this entry to satisfy us */
2875			if (old_end < offset + bytes) {
2876				bytes -= old_end - offset;
2877				offset = old_end;
2878				goto again;
2879			} else if (old_end == offset + bytes) {
2880				/* all done */
2881				goto out_lock;
2882			}
2883			spin_unlock(&ctl->tree_lock);
2884
2885			ret = __btrfs_add_free_space(block_group,
2886						     offset + bytes,
2887						     old_end - (offset + bytes),
2888						     info->trim_state);
2889			WARN_ON(ret);
2890			goto out;
2891		}
2892	}
2893
2894	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2895	if (ret == -EAGAIN) {
2896		re_search = true;
2897		goto again;
2898	}
2899out_lock:
2900	btrfs_discard_update_discardable(block_group);
2901	spin_unlock(&ctl->tree_lock);
2902out:
2903	return ret;
2904}
2905
2906void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2907			   u64 bytes)
2908{
2909	struct btrfs_fs_info *fs_info = block_group->fs_info;
2910	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2911	struct btrfs_free_space *info;
2912	struct rb_node *n;
2913	int count = 0;
2914
2915	/*
2916	 * Zoned btrfs does not use free space tree and cluster. Just print
2917	 * out the free space after the allocation offset.
2918	 */
2919	if (btrfs_is_zoned(fs_info)) {
2920		btrfs_info(fs_info, "free space %llu active %d",
2921			   block_group->zone_capacity - block_group->alloc_offset,
2922			   test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2923				    &block_group->runtime_flags));
2924		return;
2925	}
2926
2927	spin_lock(&ctl->tree_lock);
2928	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2929		info = rb_entry(n, struct btrfs_free_space, offset_index);
2930		if (info->bytes >= bytes && !block_group->ro)
2931			count++;
2932		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2933			   info->offset, info->bytes,
2934		       (info->bitmap) ? "yes" : "no");
2935	}
2936	spin_unlock(&ctl->tree_lock);
2937	btrfs_info(fs_info, "block group has cluster?: %s",
2938	       list_empty(&block_group->cluster_list) ? "no" : "yes");
2939	btrfs_info(fs_info,
2940		   "%d free space entries at or bigger than %llu bytes",
2941		   count, bytes);
2942}
2943
2944void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2945			       struct btrfs_free_space_ctl *ctl)
2946{
2947	struct btrfs_fs_info *fs_info = block_group->fs_info;
2948
2949	spin_lock_init(&ctl->tree_lock);
2950	ctl->unit = fs_info->sectorsize;
2951	ctl->start = block_group->start;
2952	ctl->block_group = block_group;
2953	ctl->op = &free_space_op;
2954	ctl->free_space_bytes = RB_ROOT_CACHED;
2955	INIT_LIST_HEAD(&ctl->trimming_ranges);
2956	mutex_init(&ctl->cache_writeout_mutex);
2957
2958	/*
2959	 * we only want to have 32k of ram per block group for keeping
2960	 * track of free space, and if we pass 1/2 of that we want to
2961	 * start converting things over to using bitmaps
2962	 */
2963	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2964}
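
/*
 * Back-of-the-envelope example for extents_thresh (sizes are approximate and
 * arch dependent): with struct btrfs_free_space around 100 bytes on 64-bit,
 * (32K / 2) / 100 gives an initial threshold of roughly 160 extent entries
 * per block group.  Once a block group accumulates that many entries,
 * use_bitmap() generally starts steering new free space into bitmap entries
 * instead.
 */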
2965
2966/*
2967 * for a given cluster, put all of its extents back into the free
2968 * space cache.  If the block group passed doesn't match the block group
2969 * pointed to by the cluster, someone else raced in and freed the
2970 * cluster already.  In that case, we just return without changing anything
2971 */
2972static void __btrfs_return_cluster_to_free_space(
2973			     struct btrfs_block_group *block_group,
2974			     struct btrfs_free_cluster *cluster)
2975{
2976	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2977	struct rb_node *node;
2978
2979	lockdep_assert_held(&ctl->tree_lock);
2980
2981	spin_lock(&cluster->lock);
2982	if (cluster->block_group != block_group) {
2983		spin_unlock(&cluster->lock);
2984		return;
2985	}
2986
2987	cluster->block_group = NULL;
2988	cluster->window_start = 0;
2989	list_del_init(&cluster->block_group_list);
2990
2991	node = rb_first(&cluster->root);
2992	while (node) {
2993		struct btrfs_free_space *entry;
2994
2995		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2996		node = rb_next(&entry->offset_index);
2997		rb_erase(&entry->offset_index, &cluster->root);
2998		RB_CLEAR_NODE(&entry->offset_index);
2999
3000		if (!entry->bitmap) {
3001			/* Merging treats extents as if they were new */
3002			if (!btrfs_free_space_trimmed(entry)) {
3003				ctl->discardable_extents[BTRFS_STAT_CURR]--;
3004				ctl->discardable_bytes[BTRFS_STAT_CURR] -=
3005					entry->bytes;
3006			}
3007
3008			try_merge_free_space(ctl, entry, false);
3009			steal_from_bitmap(ctl, entry, false);
3010
3011			/* As we insert directly, update these statistics */
3012			if (!btrfs_free_space_trimmed(entry)) {
3013				ctl->discardable_extents[BTRFS_STAT_CURR]++;
3014				ctl->discardable_bytes[BTRFS_STAT_CURR] +=
3015					entry->bytes;
3016			}
3017		}
3018		tree_insert_offset(ctl, NULL, entry);
3019		rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
3020			      entry_less);
3021	}
3022	cluster->root = RB_ROOT;
3023	spin_unlock(&cluster->lock);
3024	btrfs_put_block_group(block_group);
3025}
3026
3027void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
3028{
3029	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3030	struct btrfs_free_cluster *cluster;
3031	struct list_head *head;
3032
3033	spin_lock(&ctl->tree_lock);
3034	while ((head = block_group->cluster_list.next) !=
3035	       &block_group->cluster_list) {
3036		cluster = list_entry(head, struct btrfs_free_cluster,
3037				     block_group_list);
3038
3039		WARN_ON(cluster->block_group != block_group);
3040		__btrfs_return_cluster_to_free_space(block_group, cluster);
3041
3042		cond_resched_lock(&ctl->tree_lock);
3043	}
3044	__btrfs_remove_free_space_cache(ctl);
3045	btrfs_discard_update_discardable(block_group);
3046	spin_unlock(&ctl->tree_lock);
3047
3048}
3049
3050/*
3051 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3052 */
3053bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3054{
3055	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3056	struct btrfs_free_space *info;
3057	struct rb_node *node;
3058	bool ret = true;
3059
3060	spin_lock(&ctl->tree_lock);
3061	node = rb_first(&ctl->free_space_offset);
3062
3063	while (node) {
3064		info = rb_entry(node, struct btrfs_free_space, offset_index);
3065
3066		if (!btrfs_free_space_trimmed(info)) {
3067			ret = false;
3068			break;
3069		}
3070
3071		node = rb_next(node);
3072	}
3073
3074	spin_unlock(&ctl->tree_lock);
3075	return ret;
3076}
3077
3078u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
3079			       u64 offset, u64 bytes, u64 empty_size,
3080			       u64 *max_extent_size)
3081{
3082	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3083	struct btrfs_discard_ctl *discard_ctl =
3084					&block_group->fs_info->discard_ctl;
3085	struct btrfs_free_space *entry = NULL;
3086	u64 bytes_search = bytes + empty_size;
3087	u64 ret = 0;
3088	u64 align_gap = 0;
3089	u64 align_gap_len = 0;
3090	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3091	bool use_bytes_index = (offset == block_group->start);
3092
3093	ASSERT(!btrfs_is_zoned(block_group->fs_info));
3094
3095	spin_lock(&ctl->tree_lock);
3096	entry = find_free_space(ctl, &offset, &bytes_search,
3097				block_group->full_stripe_len, max_extent_size,
3098				use_bytes_index);
3099	if (!entry)
3100		goto out;
3101
3102	ret = offset;
3103	if (entry->bitmap) {
3104		bitmap_clear_bits(ctl, entry, offset, bytes, true);
3105
3106		if (!btrfs_free_space_trimmed(entry))
3107			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3108
3109		if (!entry->bytes)
3110			free_bitmap(ctl, entry);
3111	} else {
3112		unlink_free_space(ctl, entry, true);
3113		align_gap_len = offset - entry->offset;
3114		align_gap = entry->offset;
3115		align_gap_trim_state = entry->trim_state;
3116
3117		if (!btrfs_free_space_trimmed(entry))
3118			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3119
3120		entry->offset = offset + bytes;
3121		WARN_ON(entry->bytes < bytes + align_gap_len);
3122
3123		entry->bytes -= bytes + align_gap_len;
3124		if (!entry->bytes)
3125			kmem_cache_free(btrfs_free_space_cachep, entry);
3126		else
3127			link_free_space(ctl, entry);
3128	}
3129out:
3130	btrfs_discard_update_discardable(block_group);
3131	spin_unlock(&ctl->tree_lock);
3132
3133	if (align_gap_len)
3134		__btrfs_add_free_space(block_group, align_gap, align_gap_len,
3135				       align_gap_trim_state);
3136	return ret;
3137}
3138
3139/*
3140 * given a cluster, put all of its extents back into the free space
3141 * cache.  If a block group is passed, this function will only free
3142 * a cluster that belongs to the passed block group.
3143 *
3144 * Otherwise, it'll get a reference on the block group pointed to by the
3145 * cluster and remove the cluster from it.
3146 */
3147void btrfs_return_cluster_to_free_space(
3148			       struct btrfs_block_group *block_group,
3149			       struct btrfs_free_cluster *cluster)
3150{
3151	struct btrfs_free_space_ctl *ctl;
3152
3153	/* first, get a safe pointer to the block group */
3154	spin_lock(&cluster->lock);
3155	if (!block_group) {
3156		block_group = cluster->block_group;
3157		if (!block_group) {
3158			spin_unlock(&cluster->lock);
3159			return;
3160		}
3161	} else if (cluster->block_group != block_group) {
3162		/* Someone else has already freed it, don't redo their work. */
3163		spin_unlock(&cluster->lock);
3164		return;
3165	}
3166	btrfs_get_block_group(block_group);
3167	spin_unlock(&cluster->lock);
3168
3169	ctl = block_group->free_space_ctl;
3170
3171	/* now return any extents the cluster had on it */
3172	spin_lock(&ctl->tree_lock);
3173	__btrfs_return_cluster_to_free_space(block_group, cluster);
3174	spin_unlock(&ctl->tree_lock);
3175
3176	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3177
3178	/* finally drop our ref */
3179	btrfs_put_block_group(block_group);
3180}
3181
3182static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3183				   struct btrfs_free_cluster *cluster,
3184				   struct btrfs_free_space *entry,
3185				   u64 bytes, u64 min_start,
3186				   u64 *max_extent_size)
3187{
3188	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3189	int err;
3190	u64 search_start;
3191	u64 search_bytes;
3192	u64 ret = 0;
3193
3194	search_start = min_start;
3195	search_bytes = bytes;
3196
3197	err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3198	if (err) {
3199		*max_extent_size = max(get_max_extent_size(entry),
3200				       *max_extent_size);
3201		return 0;
3202	}
3203
3204	ret = search_start;
3205	bitmap_clear_bits(ctl, entry, ret, bytes, false);
3206
3207	return ret;
3208}
3209
3210/*
3211 * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
3212 * couldn't find anything suitably large, or a logical disk offset if
3213 * things worked out.
3214 */
3215u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3216			     struct btrfs_free_cluster *cluster, u64 bytes,
3217			     u64 min_start, u64 *max_extent_size)
3218{
3219	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3220	struct btrfs_discard_ctl *discard_ctl =
3221					&block_group->fs_info->discard_ctl;
3222	struct btrfs_free_space *entry = NULL;
3223	struct rb_node *node;
3224	u64 ret = 0;
3225
3226	ASSERT(!btrfs_is_zoned(block_group->fs_info));
3227
3228	spin_lock(&cluster->lock);
3229	if (bytes > cluster->max_size)
3230		goto out;
3231
3232	if (cluster->block_group != block_group)
3233		goto out;
3234
3235	node = rb_first(&cluster->root);
3236	if (!node)
3237		goto out;
3238
3239	entry = rb_entry(node, struct btrfs_free_space, offset_index);
3240	while (1) {
3241		if (entry->bytes < bytes)
3242			*max_extent_size = max(get_max_extent_size(entry),
3243					       *max_extent_size);
3244
3245		if (entry->bytes < bytes ||
3246		    (!entry->bitmap && entry->offset < min_start)) {
3247			node = rb_next(&entry->offset_index);
3248			if (!node)
3249				break;
3250			entry = rb_entry(node, struct btrfs_free_space,
3251					 offset_index);
3252			continue;
3253		}
3254
3255		if (entry->bitmap) {
3256			ret = btrfs_alloc_from_bitmap(block_group,
3257						      cluster, entry, bytes,
3258						      cluster->window_start,
3259						      max_extent_size);
3260			if (ret == 0) {
3261				node = rb_next(&entry->offset_index);
3262				if (!node)
3263					break;
3264				entry = rb_entry(node, struct btrfs_free_space,
3265						 offset_index);
3266				continue;
3267			}
3268			cluster->window_start += bytes;
3269		} else {
3270			ret = entry->offset;
3271
3272			entry->offset += bytes;
3273			entry->bytes -= bytes;
3274		}
3275
3276		break;
3277	}
3278out:
3279	spin_unlock(&cluster->lock);
3280
3281	if (!ret)
3282		return 0;
3283
3284	spin_lock(&ctl->tree_lock);
3285
3286	if (!btrfs_free_space_trimmed(entry))
3287		atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3288
3289	ctl->free_space -= bytes;
3290	if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3291		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3292
3293	spin_lock(&cluster->lock);
3294	if (entry->bytes == 0) {
3295		rb_erase(&entry->offset_index, &cluster->root);
3296		ctl->free_extents--;
3297		if (entry->bitmap) {
3298			kmem_cache_free(btrfs_free_space_bitmap_cachep,
3299					entry->bitmap);
3300			ctl->total_bitmaps--;
3301			recalculate_thresholds(ctl);
3302		} else if (!btrfs_free_space_trimmed(entry)) {
3303			ctl->discardable_extents[BTRFS_STAT_CURR]--;
3304		}
3305		kmem_cache_free(btrfs_free_space_cachep, entry);
3306	}
3307
3308	spin_unlock(&cluster->lock);
3309	spin_unlock(&ctl->tree_lock);
3310
3311	return ret;
3312}
3313
3314static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3315				struct btrfs_free_space *entry,
3316				struct btrfs_free_cluster *cluster,
3317				u64 offset, u64 bytes,
3318				u64 cont1_bytes, u64 min_bytes)
3319{
3320	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3321	unsigned long next_zero;
3322	unsigned long i;
3323	unsigned long want_bits;
3324	unsigned long min_bits;
3325	unsigned long found_bits;
3326	unsigned long max_bits = 0;
3327	unsigned long start = 0;
3328	unsigned long total_found = 0;
3329	int ret;
3330
3331	lockdep_assert_held(&ctl->tree_lock);
3332
3333	i = offset_to_bit(entry->offset, ctl->unit,
3334			  max_t(u64, offset, entry->offset));
3335	want_bits = bytes_to_bits(bytes, ctl->unit);
3336	min_bits = bytes_to_bits(min_bytes, ctl->unit);
3337
3338	/*
3339	 * Don't bother looking for a cluster in this bitmap if it's heavily
3340	 * fragmented.
3341	 */
3342	if (entry->max_extent_size &&
3343	    entry->max_extent_size < cont1_bytes)
3344		return -ENOSPC;
3345again:
3346	found_bits = 0;
3347	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3348		next_zero = find_next_zero_bit(entry->bitmap,
3349					       BITS_PER_BITMAP, i);
3350		if (next_zero - i >= min_bits) {
3351			found_bits = next_zero - i;
3352			if (found_bits > max_bits)
3353				max_bits = found_bits;
3354			break;
3355		}
3356		if (next_zero - i > max_bits)
3357			max_bits = next_zero - i;
3358		i = next_zero;
3359	}
3360
3361	if (!found_bits) {
3362		entry->max_extent_size = (u64)max_bits * ctl->unit;
3363		return -ENOSPC;
3364	}
3365
3366	if (!total_found) {
3367		start = i;
3368		cluster->max_size = 0;
3369	}
3370
3371	total_found += found_bits;
3372
3373	if (cluster->max_size < found_bits * ctl->unit)
3374		cluster->max_size = found_bits * ctl->unit;
3375
3376	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3377		i = next_zero + 1;
3378		goto again;
3379	}
3380
3381	cluster->window_start = start * ctl->unit + entry->offset;
3382	rb_erase(&entry->offset_index, &ctl->free_space_offset);
3383	rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3384
3385	/*
3386	 * We need to know if we're currently on the normal bytes index when we
3387	 * manipulate the bitmap so that we know we need to remove and re-insert
3388	 * it into the bytes index tree.  Clear the bytes_index node here so the
3389	 * bitmap manipulation helpers know not to mess with the bytes index
3390	 * until this bitmap entry is added back into the normal cache.
3391	 */
3392	RB_CLEAR_NODE(&entry->bytes_index);
3393
3394	ret = tree_insert_offset(ctl, cluster, entry);
3395	ASSERT(!ret); /* -EEXIST; Logic error */
3396
3397	trace_btrfs_setup_cluster(block_group, cluster,
3398				  total_found * ctl->unit, 1);
3399	return 0;
3400}
3401
3402/*
3403 * This searches the block group for just extents to fill the cluster with.
3404 * Try to find a cluster with at least bytes total bytes, at least one
3405 * extent of cont1_bytes, and other extents of at least min_bytes.
3406 */
3407static noinline int
3408setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3409			struct btrfs_free_cluster *cluster,
3410			struct list_head *bitmaps, u64 offset, u64 bytes,
3411			u64 cont1_bytes, u64 min_bytes)
3412{
3413	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3414	struct btrfs_free_space *first = NULL;
3415	struct btrfs_free_space *entry = NULL;
3416	struct btrfs_free_space *last;
3417	struct rb_node *node;
3418	u64 window_free;
3419	u64 max_extent;
3420	u64 total_size = 0;
3421
3422	lockdep_assert_held(&ctl->tree_lock);
3423
3424	entry = tree_search_offset(ctl, offset, 0, 1);
3425	if (!entry)
3426		return -ENOSPC;
3427
3428	/*
3429	 * We don't want bitmaps, so just move along until we find a normal
3430	 * extent entry.
3431	 */
3432	while (entry->bitmap || entry->bytes < min_bytes) {
3433		if (entry->bitmap && list_empty(&entry->list))
3434			list_add_tail(&entry->list, bitmaps);
3435		node = rb_next(&entry->offset_index);
3436		if (!node)
3437			return -ENOSPC;
3438		entry = rb_entry(node, struct btrfs_free_space, offset_index);
3439	}
3440
3441	window_free = entry->bytes;
3442	max_extent = entry->bytes;
3443	first = entry;
3444	last = entry;
3445
3446	for (node = rb_next(&entry->offset_index); node;
3447	     node = rb_next(&entry->offset_index)) {
3448		entry = rb_entry(node, struct btrfs_free_space, offset_index);
3449
3450		if (entry->bitmap) {
3451			if (list_empty(&entry->list))
3452				list_add_tail(&entry->list, bitmaps);
3453			continue;
3454		}
3455
3456		if (entry->bytes < min_bytes)
3457			continue;
3458
3459		last = entry;
3460		window_free += entry->bytes;
3461		if (entry->bytes > max_extent)
3462			max_extent = entry->bytes;
3463	}
3464
3465	if (window_free < bytes || max_extent < cont1_bytes)
3466		return -ENOSPC;
3467
3468	cluster->window_start = first->offset;
3469
3470	node = &first->offset_index;
3471
3472	/*
3473	 * now we've found our entries, pull them out of the free space
3474	 * cache and put them into the cluster rbtree
3475	 */
3476	do {
3477		int ret;
3478
3479		entry = rb_entry(node, struct btrfs_free_space, offset_index);
3480		node = rb_next(&entry->offset_index);
3481		if (entry->bitmap || entry->bytes < min_bytes)
3482			continue;
3483
3484		rb_erase(&entry->offset_index, &ctl->free_space_offset);
3485		rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3486		ret = tree_insert_offset(ctl, cluster, entry);
3487		total_size += entry->bytes;
3488		ASSERT(!ret); /* -EEXIST; Logic error */
3489	} while (node && entry != last);
3490
3491	cluster->max_size = max_extent;
3492	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3493	return 0;
3494}
3495
3496/*
3497 * This specifically looks for bitmaps that may work in the cluster.  We assume
3498 * that we have already failed to find extents that will work.
3499 */
3500static noinline int
3501setup_cluster_bitmap(struct btrfs_block_group *block_group,
3502		     struct btrfs_free_cluster *cluster,
3503		     struct list_head *bitmaps, u64 offset, u64 bytes,
3504		     u64 cont1_bytes, u64 min_bytes)
3505{
3506	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3507	struct btrfs_free_space *entry = NULL;
3508	int ret = -ENOSPC;
3509	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3510
3511	if (ctl->total_bitmaps == 0)
3512		return -ENOSPC;
3513
3514	/*
3515	 * The bitmap that covers offset won't be in the list unless offset
3516	 * is just its start offset.
3517	 */
3518	if (!list_empty(bitmaps))
3519		entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3520
3521	if (!entry || entry->offset != bitmap_offset) {
3522		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3523		if (entry && list_empty(&entry->list))
3524			list_add(&entry->list, bitmaps);
3525	}
3526
3527	list_for_each_entry(entry, bitmaps, list) {
3528		if (entry->bytes < bytes)
3529			continue;
3530		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3531					   bytes, cont1_bytes, min_bytes);
3532		if (!ret)
3533			return 0;
3534	}
3535
3536	/*
3537	 * The bitmaps list has all the bitmaps that record free space
3538	 * starting after offset, so no more search is required.
3539	 */
3540	return -ENOSPC;
3541}
3542
3543/*
3544 * Here we try to find a cluster of blocks in a block group.  The goal
3545 * is to find at least bytes+empty_size.
3546 * We might not find them all in one contiguous area.
3547 *
3548 * Returns zero and sets up the cluster if things worked out, otherwise
3549 * it returns -ENOSPC.
3550 */
3551int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3552			     struct btrfs_free_cluster *cluster,
3553			     u64 offset, u64 bytes, u64 empty_size)
3554{
3555	struct btrfs_fs_info *fs_info = block_group->fs_info;
3556	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3557	struct btrfs_free_space *entry, *tmp;
3558	LIST_HEAD(bitmaps);
3559	u64 min_bytes;
3560	u64 cont1_bytes;
3561	int ret;
3562
3563	/*
3564	 * Choose the minimum extent size we'll require for this
3565	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3566	 * For metadata, allow allocations with smaller extents.  For
3567	 * data, keep it dense.
3568	 */
3569	if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3570		cont1_bytes = bytes + empty_size;
3571		min_bytes = cont1_bytes;
3572	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3573		cont1_bytes = bytes;
3574		min_bytes = fs_info->sectorsize;
3575	} else {
3576		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3577		min_bytes = fs_info->sectorsize;
3578	}
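
	/*
	 * Example values (illustrative, 4 KiB sectorsize): a 1 MiB data
	 * allocation with 256 KiB of empty_size on a non-SSD_SPREAD mount
	 * gets cont1_bytes = max(1M, (1M + 256K) >> 2) = 1 MiB and
	 * min_bytes = 4 KiB, i.e. the cluster window must total at least
	 * 1.25 MiB, contain one 1 MiB contiguous extent, and may be padded
	 * out with any extents of at least 4 KiB.
	 */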
3579
3580	spin_lock(&ctl->tree_lock);
3581
3582	/*
3583	 * If we know we don't have enough space to make a cluster don't even
3584	 * bother doing all the work to try and find one.
3585	 */
3586	if (ctl->free_space < bytes) {
3587		spin_unlock(&ctl->tree_lock);
3588		return -ENOSPC;
3589	}
3590
3591	spin_lock(&cluster->lock);
3592
3593	/* someone already found a cluster, hooray */
3594	if (cluster->block_group) {
3595		ret = 0;
3596		goto out;
3597	}
3598
3599	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3600				 min_bytes);
3601
3602	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3603				      bytes + empty_size,
3604				      cont1_bytes, min_bytes);
3605	if (ret)
3606		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3607					   offset, bytes + empty_size,
3608					   cont1_bytes, min_bytes);
3609
3610	/* Clear our temporary list */
3611	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3612		list_del_init(&entry->list);
3613
3614	if (!ret) {
3615		btrfs_get_block_group(block_group);
3616		list_add_tail(&cluster->block_group_list,
3617			      &block_group->cluster_list);
3618		cluster->block_group = block_group;
3619	} else {
3620		trace_btrfs_failed_cluster_setup(block_group);
3621	}
3622out:
3623	spin_unlock(&cluster->lock);
3624	spin_unlock(&ctl->tree_lock);
3625
3626	return ret;
3627}
3628
3629/*
3630 * simple code to zero out a cluster
3631 */
3632void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3633{
3634	spin_lock_init(&cluster->lock);
3635	spin_lock_init(&cluster->refill_lock);
3636	cluster->root = RB_ROOT;
3637	cluster->max_size = 0;
3638	cluster->fragmented = false;
3639	INIT_LIST_HEAD(&cluster->block_group_list);
3640	cluster->block_group = NULL;
3641}
3642
3643static int do_trimming(struct btrfs_block_group *block_group,
3644		       u64 *total_trimmed, u64 start, u64 bytes,
3645		       u64 reserved_start, u64 reserved_bytes,
3646		       enum btrfs_trim_state reserved_trim_state,
3647		       struct btrfs_trim_range *trim_entry)
3648{
3649	struct btrfs_space_info *space_info = block_group->space_info;
3650	struct btrfs_fs_info *fs_info = block_group->fs_info;
3651	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3652	int ret;
3653	int update = 0;
3654	const u64 end = start + bytes;
3655	const u64 reserved_end = reserved_start + reserved_bytes;
3656	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3657	u64 trimmed = 0;
3658
3659	spin_lock(&space_info->lock);
3660	spin_lock(&block_group->lock);
3661	if (!block_group->ro) {
3662		block_group->reserved += reserved_bytes;
3663		space_info->bytes_reserved += reserved_bytes;
3664		update = 1;
3665	}
3666	spin_unlock(&block_group->lock);
3667	spin_unlock(&space_info->lock);
3668
3669	ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3670	if (!ret) {
3671		*total_trimmed += trimmed;
3672		trim_state = BTRFS_TRIM_STATE_TRIMMED;
3673	}
3674
3675	mutex_lock(&ctl->cache_writeout_mutex);
3676	if (reserved_start < start)
3677		__btrfs_add_free_space(block_group, reserved_start,
3678				       start - reserved_start,
3679				       reserved_trim_state);
3680	if (end < reserved_end)
3681		__btrfs_add_free_space(block_group, end, reserved_end - end,
3682				       reserved_trim_state);
3683	__btrfs_add_free_space(block_group, start, bytes, trim_state);
3684	list_del(&trim_entry->list);
3685	mutex_unlock(&ctl->cache_writeout_mutex);
3686
3687	if (update) {
3688		spin_lock(&space_info->lock);
3689		spin_lock(&block_group->lock);
3690		if (block_group->ro)
3691			space_info->bytes_readonly += reserved_bytes;
3692		block_group->reserved -= reserved_bytes;
3693		space_info->bytes_reserved -= reserved_bytes;
3694		spin_unlock(&block_group->lock);
3695		spin_unlock(&space_info->lock);
3696	}
3697
3698	return ret;
3699}
3700
3701/*
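 * Trim the extent (non-bitmap) free space entries within [start, end),
 * re-adding the space to the free space tree with the resulting trim state.
 *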
3702 * If @async is set, then we will trim 1 region and return.
3703 */
3704static int trim_no_bitmap(struct btrfs_block_group *block_group,
3705			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3706			  bool async)
3707{
3708	struct btrfs_discard_ctl *discard_ctl =
3709					&block_group->fs_info->discard_ctl;
3710	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3711	struct btrfs_free_space *entry;
3712	struct rb_node *node;
3713	int ret = 0;
3714	u64 extent_start;
3715	u64 extent_bytes;
3716	enum btrfs_trim_state extent_trim_state;
3717	u64 bytes;
3718	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3719
3720	while (start < end) {
3721		struct btrfs_trim_range trim_entry;
3722
3723		mutex_lock(&ctl->cache_writeout_mutex);
3724		spin_lock(&ctl->tree_lock);
3725
3726		if (ctl->free_space < minlen)
3727			goto out_unlock;
3728
3729		entry = tree_search_offset(ctl, start, 0, 1);
3730		if (!entry)
3731			goto out_unlock;
3732
3733		/* Skip bitmaps and, if async, entries that are already trimmed */
3734		while (entry->bitmap ||
3735		       (async && btrfs_free_space_trimmed(entry))) {
3736			node = rb_next(&entry->offset_index);
3737			if (!node)
3738				goto out_unlock;
3739			entry = rb_entry(node, struct btrfs_free_space,
3740					 offset_index);
3741		}
3742
3743		if (entry->offset >= end)
3744			goto out_unlock;
3745
3746		extent_start = entry->offset;
3747		extent_bytes = entry->bytes;
3748		extent_trim_state = entry->trim_state;
3749		if (async) {
3750			start = entry->offset;
3751			bytes = entry->bytes;
3752			if (bytes < minlen) {
3753				spin_unlock(&ctl->tree_lock);
3754				mutex_unlock(&ctl->cache_writeout_mutex);
3755				goto next;
3756			}
3757			unlink_free_space(ctl, entry, true);
3758			/*
3759			 * Let bytes = max_discard_size + X.
3760			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3761			 * X when we come back around.  So trim it now.
3762			 */
3763			if (max_discard_size &&
3764			    bytes >= (max_discard_size +
3765				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3766				bytes = max_discard_size;
3767				extent_bytes = max_discard_size;
3768				entry->offset += max_discard_size;
3769				entry->bytes -= max_discard_size;
3770				link_free_space(ctl, entry);
3771			} else {
3772				kmem_cache_free(btrfs_free_space_cachep, entry);
3773			}
3774		} else {
3775			start = max(start, extent_start);
3776			bytes = min(extent_start + extent_bytes, end) - start;
3777			if (bytes < minlen) {
3778				spin_unlock(&ctl->tree_lock);
3779				mutex_unlock(&ctl->cache_writeout_mutex);
3780				goto next;
3781			}
3782
3783			unlink_free_space(ctl, entry, true);
3784			kmem_cache_free(btrfs_free_space_cachep, entry);
3785		}
3786
3787		spin_unlock(&ctl->tree_lock);
3788		trim_entry.start = extent_start;
3789		trim_entry.bytes = extent_bytes;
3790		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3791		mutex_unlock(&ctl->cache_writeout_mutex);
3792
3793		ret = do_trimming(block_group, total_trimmed, start, bytes,
3794				  extent_start, extent_bytes, extent_trim_state,
3795				  &trim_entry);
3796		if (ret) {
3797			block_group->discard_cursor = start + bytes;
3798			break;
3799		}
3800next:
3801		start += bytes;
3802		block_group->discard_cursor = start;
3803		if (async && *total_trimmed)
3804			break;
3805
3806		if (fatal_signal_pending(current)) {
3807			ret = -ERESTARTSYS;
3808			break;
3809		}
3810
3811		cond_resched();
3812	}
3813
3814	return ret;
3815
3816out_unlock:
3817	block_group->discard_cursor = btrfs_block_group_end(block_group);
3818	spin_unlock(&ctl->tree_lock);
3819	mutex_unlock(&ctl->cache_writeout_mutex);
3820
3821	return ret;
3822}
3823
3824/*
3825 * If we break out of trimming a bitmap prematurely, we should reset the
3826 * trimming bit.  In a rather contrived case, it's possible to race here, so
3827 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3828 *
3829 * start = start of bitmap
3830 * end = near end of bitmap
3831 *
3832 * Thread 1:			Thread 2:
3833 * trim_bitmaps(start)
3834 *				trim_bitmaps(end)
3835 *				end_trimming_bitmap()
3836 * reset_trimming_bitmap()
3837 */
3838static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3839{
3840	struct btrfs_free_space *entry;
3841
3842	spin_lock(&ctl->tree_lock);
3843	entry = tree_search_offset(ctl, offset, 1, 0);
3844	if (entry) {
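		/*
		 * Going back to untrimmed makes the bitmap discardable again,
		 * so add it back into the discardable stats if it had been
		 * counted as trimmed.
		 */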
3845		if (btrfs_free_space_trimmed(entry)) {
3846			ctl->discardable_extents[BTRFS_STAT_CURR] +=
3847				entry->bitmap_extents;
3848			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3849		}
3850		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3851	}
3852
3853	spin_unlock(&ctl->tree_lock);
3854}
3855
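/*
 * If a trim pass was in progress on this bitmap, mark it trimmed and drop it
 * from the discardable statistics.
 */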
3856static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3857				struct btrfs_free_space *entry)
3858{
3859	if (btrfs_free_space_trimming_bitmap(entry)) {
3860		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3861		ctl->discardable_extents[BTRFS_STAT_CURR] -=
3862			entry->bitmap_extents;
3863		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3864	}
3865}
3866
3867/*
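 * Trim the free regions tracked by the bitmap entries covering [start, end),
 * subject to the @minlen/@maxlen filters.
 *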
3868 * If @async is set, then we will trim 1 region and return.
3869 */
3870static int trim_bitmaps(struct btrfs_block_group *block_group,
3871			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3872			u64 maxlen, bool async)
3873{
3874	struct btrfs_discard_ctl *discard_ctl =
3875					&block_group->fs_info->discard_ctl;
3876	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3877	struct btrfs_free_space *entry;
3878	int ret = 0;
3879	int ret2;
3880	u64 bytes;
3881	u64 offset = offset_to_bitmap(ctl, start);
3882	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3883
3884	while (offset < end) {
3885		bool next_bitmap = false;
3886		struct btrfs_trim_range trim_entry;
3887
3888		mutex_lock(&ctl->cache_writeout_mutex);
3889		spin_lock(&ctl->tree_lock);
3890
3891		if (ctl->free_space < minlen) {
3892			block_group->discard_cursor =
3893				btrfs_block_group_end(block_group);
3894			spin_unlock(&ctl->tree_lock);
3895			mutex_unlock(&ctl->cache_writeout_mutex);
3896			break;
3897		}
3898
3899		entry = tree_search_offset(ctl, offset, 1, 0);
3900		/*
3901		 * Bitmaps are marked trimmed lossily now to prevent constant
3902		 * discarding of the same bitmap (the reason why we are bound
3903		 * by the filters).  So, retrim the block group bitmaps when we
3904		 * are preparing to punt to the unused_bgs list.  This uses
3905		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3906		 * which is the only discard index which sets minlen to 0.
3907		 */
3908		if (!entry || (async && minlen && start == offset &&
3909			       btrfs_free_space_trimmed(entry))) {
3910			spin_unlock(&ctl->tree_lock);
3911			mutex_unlock(&ctl->cache_writeout_mutex);
3912			next_bitmap = true;
3913			goto next;
3914		}
3915
3916		/*
3917		 * Async discard bitmap trimming begins by setting the start
3918		 * to key.objectid, and offset_to_bitmap() aligns to the
3919		 * start of the bitmap.  This lets us know we are fully
3920		 * scanning the bitmap rather than only some portion of it.
3921		 */
3922		if (start == offset)
3923			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3924
3925		bytes = minlen;
3926		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3927		if (ret2 || start >= end) {
3928			/*
3929			 * We lossily consider a bitmap trimmed if we only skip
3930			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3931			 */
3932			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3933				end_trimming_bitmap(ctl, entry);
3934			else
3935				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3936			spin_unlock(&ctl->tree_lock);
3937			mutex_unlock(&ctl->cache_writeout_mutex);
3938			next_bitmap = true;
3939			goto next;
3940		}
3941
3942		/*
3943		 * We already trimmed a region, but are using the locking above
3944		 * to reset the trim_state.
3945		 */
3946		if (async && *total_trimmed) {
3947			spin_unlock(&ctl->tree_lock);
3948			mutex_unlock(&ctl->cache_writeout_mutex);
3949			goto out;
3950		}
3951
3952		bytes = min(bytes, end - start);
3953		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3954			spin_unlock(&ctl->tree_lock);
3955			mutex_unlock(&ctl->cache_writeout_mutex);
3956			goto next;
3957		}
3958
3959		/*
3960		 * Let bytes = max_discard_size + X.
3961		 * If X < @minlen, we won't trim X when we come back around.
3962		 * So trim it now.  We differ here from trimming extents as we
3963		 * don't keep individual state per bit.
3964		 */
3965		if (async &&
3966		    max_discard_size &&
3967		    bytes > (max_discard_size + minlen))
3968			bytes = max_discard_size;
3969
3970		bitmap_clear_bits(ctl, entry, start, bytes, true);
3971		if (entry->bytes == 0)
3972			free_bitmap(ctl, entry);
3973
3974		spin_unlock(&ctl->tree_lock);
3975		trim_entry.start = start;
3976		trim_entry.bytes = bytes;
3977		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3978		mutex_unlock(&ctl->cache_writeout_mutex);
3979
3980		ret = do_trimming(block_group, total_trimmed, start, bytes,
3981				  start, bytes, 0, &trim_entry);
3982		if (ret) {
3983			reset_trimming_bitmap(ctl, offset);
3984			block_group->discard_cursor =
3985				btrfs_block_group_end(block_group);
3986			break;
3987		}
3988next:
3989		if (next_bitmap) {
3990			offset += BITS_PER_BITMAP * ctl->unit;
3991			start = offset;
3992		} else {
3993			start += bytes;
3994		}
3995		block_group->discard_cursor = start;
3996
3997		if (fatal_signal_pending(current)) {
3998			if (start != offset)
3999				reset_trimming_bitmap(ctl, offset);
4000			ret = -ERESTARTSYS;
4001			break;
4002		}
4003
4004		cond_resched();
4005	}
4006
4007	if (offset >= end)
4008		block_group->discard_cursor = end;
4009
4010out:
4011	return ret;
4012}
4013
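/*
 * Trim the free space of a whole block group: first the extent entries, then
 * the bitmaps.  If @end lands in the middle of a bitmap, that bitmap's trim
 * state is reset so a later pass re-examines it from scratch.
 */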
4014int btrfs_trim_block_group(struct btrfs_block_group *block_group,
4015			   u64 *trimmed, u64 start, u64 end, u64 minlen)
4016{
4017	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
4018	int ret;
4019	u64 rem = 0;
4020
4021	ASSERT(!btrfs_is_zoned(block_group->fs_info));
4022
4023	*trimmed = 0;
4024
4025	spin_lock(&block_group->lock);
4026	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4027		spin_unlock(&block_group->lock);
4028		return 0;
4029	}
4030	btrfs_freeze_block_group(block_group);
4031	spin_unlock(&block_group->lock);
4032
4033	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
4034	if (ret)
4035		goto out;
4036
4037	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
4038	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4039	/* If we ended in the middle of a bitmap, reset the trimming flag */
4040	if (rem)
4041		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
4042out:
4043	btrfs_unfreeze_block_group(block_group);
4044	return ret;
4045}
4046
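/*
 * Trim only the extent (non-bitmap) entries of a block group; with @async set
 * this trims a single region and returns.
 */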
4047int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4048				   u64 *trimmed, u64 start, u64 end, u64 minlen,
4049				   bool async)
4050{
4051	int ret;
4052
4053	*trimmed = 0;
4054
4055	spin_lock(&block_group->lock);
4056	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4057		spin_unlock(&block_group->lock);
4058		return 0;
4059	}
4060	btrfs_freeze_block_group(block_group);
4061	spin_unlock(&block_group->lock);
4062
4063	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
4064	btrfs_unfreeze_block_group(block_group);
4065
4066	return ret;
4067}
4068
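/*
 * Trim only the bitmap entries of a block group, bounded by the @minlen and
 * @maxlen filters; with @async set this trims a single region and returns.
 */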
4069int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4070				   u64 *trimmed, u64 start, u64 end, u64 minlen,
4071				   u64 maxlen, bool async)
4072{
4073	int ret;
4074
4075	*trimmed = 0;
4076
4077	spin_lock(&block_group->lock);
4078	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4079		spin_unlock(&block_group->lock);
4080		return 0;
4081	}
4082	btrfs_freeze_block_group(block_group);
4083	spin_unlock(&block_group->lock);
4084
4085	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4086			   async);
4087
4088	btrfs_unfreeze_block_group(block_group);
4089
4090	return ret;
4091}
4092
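/*
 * The v1 free space cache is considered active when the superblock carries a
 * non-zero cache generation.
 */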
4093bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4094{
4095	return btrfs_super_cache_generation(fs_info->super_copy);
4096}
4097
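/* Remove the free space cache inode of every block group. */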
4098static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4099				       struct btrfs_trans_handle *trans)
4100{
4101	struct btrfs_block_group *block_group;
4102	struct rb_node *node;
4103	int ret = 0;
4104
4105	btrfs_info(fs_info, "cleaning free space cache v1");
4106
4107	node = rb_first_cached(&fs_info->block_group_cache_tree);
4108	while (node) {
4109		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4110		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4111		if (ret)
4112			goto out;
4113		node = rb_next(node);
4114	}
4115out:
4116	return ret;
4117}
4118
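/*
 * Enable or disable the v1 free space cache.  Disabling removes the free
 * space inode of every block group; either way a transaction commit updates
 * the superblock's cache generation.
 */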
4119int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4120{
4121	struct btrfs_trans_handle *trans;
4122	int ret;
4123
4124	/*
4125	 * update_super_roots will appropriately set or unset
4126	 * super_copy->cache_generation based on SPACE_CACHE and
4127	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4128	 * transaction commit whether we are enabling space cache v1 and don't
4129	 * have any other work to do, or are disabling it and removing free
4130	 * space inodes.
4131	 */
4132	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4133	if (IS_ERR(trans))
4134		return PTR_ERR(trans);
4135
4136	if (!active) {
4137		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4138		ret = cleanup_free_space_cache_v1(fs_info, trans);
4139		if (ret) {
4140			btrfs_abort_transaction(trans, ret);
4141			btrfs_end_transaction(trans);
4142			goto out;
4143		}
4144	}
4145
4146	ret = btrfs_commit_transaction(trans);
4147out:
4148	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4149
4150	return ret;
4151}
4152
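/* Create the slab caches used for free space entries and their bitmaps. */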
4153int __init btrfs_free_space_init(void)
4154{
4155	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
4156			sizeof(struct btrfs_free_space), 0,
4157			SLAB_MEM_SPREAD, NULL);
4158	if (!btrfs_free_space_cachep)
4159		return -ENOMEM;
4160
4161	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
4162							PAGE_SIZE, PAGE_SIZE,
4163							SLAB_MEM_SPREAD, NULL);
4164	if (!btrfs_free_space_bitmap_cachep) {
4165		kmem_cache_destroy(btrfs_free_space_cachep);
4166		return -ENOMEM;
4167	}
4168
4169	return 0;
4170}
4171
4172void __cold btrfs_free_space_exit(void)
4173{
4174	kmem_cache_destroy(btrfs_free_space_cachep);
4175	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
4176}
4177
4178#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4179/*
4180 * Use this if you need to make a bitmap or extent entry specifically.  It
4181 * doesn't do any of the merging that add_free_space does; it acts a lot like
4182 * how the free space cache loading works, so you can end up with really weird
4183 * configurations.
4184 */
4185int test_add_free_space_entry(struct btrfs_block_group *cache,
4186			      u64 offset, u64 bytes, bool bitmap)
4187{
4188	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4189	struct btrfs_free_space *info = NULL, *bitmap_info;
4190	void *map = NULL;
4191	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4192	u64 bytes_added;
4193	int ret;
4194
4195again:
4196	if (!info) {
4197		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4198		if (!info)
4199			return -ENOMEM;
4200	}
4201
4202	if (!bitmap) {
4203		spin_lock(&ctl->tree_lock);
4204		info->offset = offset;
4205		info->bytes = bytes;
4206		info->max_extent_size = 0;
4207		ret = link_free_space(ctl, info);
4208		spin_unlock(&ctl->tree_lock);
4209		if (ret)
4210			kmem_cache_free(btrfs_free_space_cachep, info);
4211		return ret;
4212	}
4213
4214	if (!map) {
4215		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4216		if (!map) {
4217			kmem_cache_free(btrfs_free_space_cachep, info);
4218			return -ENOMEM;
4219		}
4220	}
4221
4222	spin_lock(&ctl->tree_lock);
4223	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4224					 1, 0);
4225	if (!bitmap_info) {
4226		info->bitmap = map;
4227		map = NULL;
4228		add_new_bitmap(ctl, info, offset);
4229		bitmap_info = info;
4230		info = NULL;
4231	}
4232
4233	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4234					  trim_state);
4235
4236	bytes -= bytes_added;
4237	offset += bytes_added;
4238	spin_unlock(&ctl->tree_lock);
4239
4240	if (bytes)
4241		goto again;
4242
4243	if (info)
4244		kmem_cache_free(btrfs_free_space_cachep, info);
4245	if (map)
4246		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4247	return 0;
4248}
4249
4250/*
4251 * Check whether the given range overlaps free space in the cache.  This is
4252 * really just used to verify the absence of space, so if there is any free
4253 * space in the range at all we return 1.
4254 */
4255int test_check_exists(struct btrfs_block_group *cache,
4256		      u64 offset, u64 bytes)
4257{
4258	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4259	struct btrfs_free_space *info;
4260	int ret = 0;
4261
4262	spin_lock(&ctl->tree_lock);
4263	info = tree_search_offset(ctl, offset, 0, 0);
4264	if (!info) {
4265		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4266					  1, 0);
4267		if (!info)
4268			goto out;
4269	}
4270
4271have_info:
4272	if (info->bitmap) {
4273		u64 bit_off, bit_bytes;
4274		struct rb_node *n;
4275		struct btrfs_free_space *tmp;
4276
4277		bit_off = offset;
4278		bit_bytes = ctl->unit;
4279		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4280		if (!ret) {
4281			if (bit_off == offset) {
4282				ret = 1;
4283				goto out;
4284			} else if (bit_off > offset &&
4285				   offset + bytes > bit_off) {
4286				ret = 1;
4287				goto out;
4288			}
4289		}
4290
4291		n = rb_prev(&info->offset_index);
4292		while (n) {
4293			tmp = rb_entry(n, struct btrfs_free_space,
4294				       offset_index);
4295			if (tmp->offset + tmp->bytes < offset)
4296				break;
4297			if (offset + bytes < tmp->offset) {
4298				n = rb_prev(&tmp->offset_index);
4299				continue;
4300			}
4301			info = tmp;
4302			goto have_info;
4303		}
4304
4305		n = rb_next(&info->offset_index);
4306		while (n) {
4307			tmp = rb_entry(n, struct btrfs_free_space,
4308				       offset_index);
4309			if (offset + bytes < tmp->offset)
4310				break;
4311			if (tmp->offset + tmp->bytes < offset) {
4312				n = rb_next(&tmp->offset_index);
4313				continue;
4314			}
4315			info = tmp;
4316			goto have_info;
4317		}
4318
4319		ret = 0;
4320		goto out;
4321	}
4322
4323	if (info->offset == offset) {
4324		ret = 1;
4325		goto out;
4326	}
4327
4328	if (offset > info->offset && offset < info->offset + info->bytes)
4329		ret = 1;
4330out:
4331	spin_unlock(&ctl->tree_lock);
4332	return ret;
4333}
4334#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
4335