xref: /kernel/linux/linux-6.6/fs/btrfs/inode.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <crypto/hash.h>
7#include <linux/kernel.h>
8#include <linux/bio.h>
9#include <linux/blk-cgroup.h>
10#include <linux/file.h>
11#include <linux/fs.h>
12#include <linux/pagemap.h>
13#include <linux/highmem.h>
14#include <linux/time.h>
15#include <linux/init.h>
16#include <linux/string.h>
17#include <linux/backing-dev.h>
18#include <linux/writeback.h>
19#include <linux/compat.h>
20#include <linux/xattr.h>
21#include <linux/posix_acl.h>
22#include <linux/falloc.h>
23#include <linux/slab.h>
24#include <linux/ratelimit.h>
25#include <linux/btrfs.h>
26#include <linux/blkdev.h>
27#include <linux/posix_acl_xattr.h>
28#include <linux/uio.h>
29#include <linux/magic.h>
30#include <linux/iversion.h>
31#include <linux/swap.h>
32#include <linux/migrate.h>
33#include <linux/sched/mm.h>
34#include <linux/iomap.h>
35#include <asm/unaligned.h>
36#include <linux/fsverity.h>
37#include "misc.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "print-tree.h"
43#include "ordered-data.h"
44#include "xattr.h"
45#include "tree-log.h"
46#include "bio.h"
47#include "compression.h"
48#include "locking.h"
49#include "free-space-cache.h"
50#include "props.h"
51#include "qgroup.h"
52#include "delalloc-space.h"
53#include "block-group.h"
54#include "space-info.h"
55#include "zoned.h"
56#include "subpage.h"
57#include "inode-item.h"
58#include "fs.h"
59#include "accessors.h"
60#include "extent-tree.h"
61#include "root-tree.h"
62#include "defrag.h"
63#include "dir-item.h"
64#include "file-item.h"
65#include "uuid-tree.h"
66#include "ioctl.h"
67#include "file.h"
68#include "acl.h"
69#include "relocation.h"
70#include "verity.h"
71#include "super.h"
72#include "orphan.h"
73#include "backref.h"
74
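/* Arguments identifying an inode for btrfs_iget()-style lookups: inode number plus owning root. */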
75struct btrfs_iget_args {
76	u64 ino;
77	struct btrfs_root *root;
78};
79
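/* Per-request state for btrfs direct I/O. */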
80struct btrfs_dio_data {
81	ssize_t submitted;
82	struct extent_changeset *data_reserved;
83	struct btrfs_ordered_extent *ordered;
84	bool data_space_reserved;
85	bool nocow_done;
86};
87
88struct btrfs_dio_private {
89	/* Range of I/O */
90	u64 file_offset;
91	u32 bytes;
92
93	/* This must be last */
94	struct btrfs_bio bbio;
95};
96
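/* bio_set used to allocate struct btrfs_dio_private (with its embedded bio) for direct I/O. */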
97static struct bio_set btrfs_dio_bioset;
98
99struct btrfs_rename_ctx {
100	/* Output field. Stores the index number of the old directory entry. */
101	u64 index;
102};
103
104/*
105 * Used by data_reloc_print_warning_inode() to pass needed info for filename
106 * resolution and output of error message.
107 */
108struct data_reloc_warn {
109	struct btrfs_path path;
110	struct btrfs_fs_info *fs_info;
111	u64 extent_item_size;
112	u64 logical;
113	int mirror_num;
114};
115
116static const struct inode_operations btrfs_dir_inode_operations;
117static const struct inode_operations btrfs_symlink_inode_operations;
118static const struct inode_operations btrfs_special_inode_operations;
119static const struct inode_operations btrfs_file_inode_operations;
120static const struct address_space_operations btrfs_aops;
121static const struct file_operations btrfs_dir_file_operations;
122
123static struct kmem_cache *btrfs_inode_cachep;
124
125static int btrfs_setsize(struct inode *inode, struct iattr *attr);
126static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
127
128static noinline int run_delalloc_cow(struct btrfs_inode *inode,
129				     struct page *locked_page, u64 start,
130				     u64 end, struct writeback_control *wbc,
131				     bool pages_dirty);
132static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
133				       u64 len, u64 orig_start, u64 block_start,
134				       u64 block_len, u64 orig_block_len,
135				       u64 ram_bytes, int compress_type,
136				       int type);
137
138static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
139					  u64 root, void *warn_ctx)
140{
141	struct data_reloc_warn *warn = warn_ctx;
142	struct btrfs_fs_info *fs_info = warn->fs_info;
143	struct extent_buffer *eb;
144	struct btrfs_inode_item *inode_item;
145	struct inode_fs_paths *ipath = NULL;
146	struct btrfs_root *local_root;
147	struct btrfs_key key;
148	unsigned int nofs_flag;
149	u32 nlink;
150	int ret;
151
152	local_root = btrfs_get_fs_root(fs_info, root, true);
153	if (IS_ERR(local_root)) {
154		ret = PTR_ERR(local_root);
155		goto err;
156	}
157
158	/* This makes the path point to (inum INODE_ITEM ioff). */
159	key.objectid = inum;
160	key.type = BTRFS_INODE_ITEM_KEY;
161	key.offset = 0;
162
163	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
164	if (ret) {
165		btrfs_put_root(local_root);
166		btrfs_release_path(&warn->path);
167		goto err;
168	}
169
170	eb = warn->path.nodes[0];
171	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
172	nlink = btrfs_inode_nlink(eb, inode_item);
173	btrfs_release_path(&warn->path);
174
175	nofs_flag = memalloc_nofs_save();
176	ipath = init_ipath(4096, local_root, &warn->path);
177	memalloc_nofs_restore(nofs_flag);
178	if (IS_ERR(ipath)) {
179		btrfs_put_root(local_root);
180		ret = PTR_ERR(ipath);
181		ipath = NULL;
182		/*
183		 * -ENOMEM, not a critical error, just output a generic error
184		 * message without the filename.
185		 */
186		btrfs_warn(fs_info,
187"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
188			   warn->logical, warn->mirror_num, root, inum, offset);
189		return ret;
190	}
191	ret = paths_from_inode(inum, ipath);
192	if (ret < 0)
193		goto err;
194
195	/*
196	 * We deliberately ignore the fact that ipath might have been too small
197	 * to hold all of the paths here.
198	 */
199	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
200		btrfs_warn(fs_info,
201"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
202			   warn->logical, warn->mirror_num, root, inum, offset,
203			   fs_info->sectorsize, nlink,
204			   (char *)(unsigned long)ipath->fspath->val[i]);
205	}
206
207	btrfs_put_root(local_root);
208	free_ipath(ipath);
209	return 0;
210
211err:
212	btrfs_warn(fs_info,
213"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
214		   warn->logical, warn->mirror_num, root, inum, offset, ret);
215
216	free_ipath(ipath);
217	return ret;
218}
219
220/*
221 * Do extra user-friendly error output (e.g. lookup all the affected files).
222 *
223 * If the backref lookup fails, we have to fall back to the old error
224 * message, without the resolved file paths.
225 */
226static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
227				   const u8 *csum, const u8 *csum_expected,
228				   int mirror_num)
229{
230	struct btrfs_fs_info *fs_info = inode->root->fs_info;
231	struct btrfs_path path = { 0 };
232	struct btrfs_key found_key = { 0 };
233	struct extent_buffer *eb;
234	struct btrfs_extent_item *ei;
235	const u32 csum_size = fs_info->csum_size;
236	u64 logical;
237	u64 flags;
238	u32 item_size;
239	int ret;
240
241	mutex_lock(&fs_info->reloc_mutex);
242	logical = btrfs_get_reloc_bg_bytenr(fs_info);
243	mutex_unlock(&fs_info->reloc_mutex);
244
245	if (logical == U64_MAX) {
246		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
247		btrfs_warn_rl(fs_info,
248"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
249			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
250			CSUM_FMT_VALUE(csum_size, csum),
251			CSUM_FMT_VALUE(csum_size, csum_expected),
252			mirror_num);
253		return;
254	}
255
256	logical += file_off;
257	btrfs_warn_rl(fs_info,
258"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
259			inode->root->root_key.objectid,
260			btrfs_ino(inode), file_off, logical,
261			CSUM_FMT_VALUE(csum_size, csum),
262			CSUM_FMT_VALUE(csum_size, csum_expected),
263			mirror_num);
264
265	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
266	if (ret < 0) {
267		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
268			     logical, ret);
269		return;
270	}
271	eb = path.nodes[0];
272	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
273	item_size = btrfs_item_size(eb, path.slots[0]);
274	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
275		unsigned long ptr = 0;
276		u64 ref_root;
277		u8 ref_level;
278
279		while (true) {
280			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
281						      item_size, &ref_root,
282						      &ref_level);
283			if (ret < 0) {
284				btrfs_warn_rl(fs_info,
285				"failed to resolve tree backref for logical %llu: %d",
286					      logical, ret);
287				break;
288			}
289			if (ret > 0)
290				break;
291
292			btrfs_warn_rl(fs_info,
293"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
294				logical, mirror_num,
295				(ref_level ? "node" : "leaf"),
296				ref_level, ref_root);
297		}
298		btrfs_release_path(&path);
299	} else {
300		struct btrfs_backref_walk_ctx ctx = { 0 };
301		struct data_reloc_warn reloc_warn = { 0 };
302
303		btrfs_release_path(&path);
304
305		ctx.bytenr = found_key.objectid;
306		ctx.extent_item_pos = logical - found_key.objectid;
307		ctx.fs_info = fs_info;
308
309		reloc_warn.logical = logical;
310		reloc_warn.extent_item_size = found_key.offset;
311		reloc_warn.mirror_num = mirror_num;
312		reloc_warn.fs_info = fs_info;
313
314		iterate_extent_inodes(&ctx, true,
315				      data_reloc_print_warning_inode, &reloc_warn);
316	}
317}
318
319static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
320		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
321{
322	struct btrfs_root *root = inode->root;
323	const u32 csum_size = root->fs_info->csum_size;
324
325	/* For data reloc tree, it's better to do a backref lookup instead. */
326	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
327		return print_data_reloc_error(inode, logical_start, csum,
328					      csum_expected, mirror_num);
329
330	/* For special roots, print the objectid as a signed value, which is more meaningful */
331	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
332		btrfs_warn_rl(root->fs_info,
333"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
334			root->root_key.objectid, btrfs_ino(inode),
335			logical_start,
336			CSUM_FMT_VALUE(csum_size, csum),
337			CSUM_FMT_VALUE(csum_size, csum_expected),
338			mirror_num);
339	} else {
340		btrfs_warn_rl(root->fs_info,
341"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
342			root->root_key.objectid, btrfs_ino(inode),
343			logical_start,
344			CSUM_FMT_VALUE(csum_size, csum),
345			CSUM_FMT_VALUE(csum_size, csum_expected),
346			mirror_num);
347	}
348}
349
350/*
351 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
352 *
353 * ilock_flags can have the following bits set:
354 *
355 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
356 * BTRFS_ILOCK_TRY - try to acquire the lock; if it fails on the first
357 *		     attempt, return -EAGAIN
358 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
359 */
360int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
361{
362	if (ilock_flags & BTRFS_ILOCK_SHARED) {
363		if (ilock_flags & BTRFS_ILOCK_TRY) {
364			if (!inode_trylock_shared(&inode->vfs_inode))
365				return -EAGAIN;
366			else
367				return 0;
368		}
369		inode_lock_shared(&inode->vfs_inode);
370	} else {
371		if (ilock_flags & BTRFS_ILOCK_TRY) {
372			if (!inode_trylock(&inode->vfs_inode))
373				return -EAGAIN;
374			else
375				return 0;
376		}
377		inode_lock(&inode->vfs_inode);
378	}
379	if (ilock_flags & BTRFS_ILOCK_MMAP)
380		down_write(&inode->i_mmap_lock);
381	return 0;
382}
383
384/*
385 * btrfs_inode_unlock - unlock inode i_rwsem
386 *
387 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
388 * to decide whether the lock acquired is shared or exclusive.
389 */
390void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
391{
392	if (ilock_flags & BTRFS_ILOCK_MMAP)
393		up_write(&inode->i_mmap_lock);
394	if (ilock_flags & BTRFS_ILOCK_SHARED)
395		inode_unlock_shared(&inode->vfs_inode);
396	else
397		inode_unlock(&inode->vfs_inode);
398}
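
/*
 * Example usage (illustrative sketch only): callers pair the two helpers
 * above with matching BTRFS_ILOCK_* flags, e.g.:
 *
 *	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
 *	... read-mostly work on the inode ...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 *
 * If BTRFS_ILOCK_TRY is set and btrfs_inode_lock() returns -EAGAIN, the
 * lock was not acquired and btrfs_inode_unlock() must not be called.
 */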
399
400/*
401 * Clean up all submitted ordered extents in the specified range to handle errors
402 * from the btrfs_run_delalloc_range() callback.
403 *
404 * NOTE: caller must ensure that when an error happens, it can not call
405 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
406 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
407 * to be released, which we want to happen only when finishing the ordered
408 * extent (btrfs_finish_ordered_io()).
409 */
410static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
411						 struct page *locked_page,
412						 u64 offset, u64 bytes)
413{
414	unsigned long index = offset >> PAGE_SHIFT;
415	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
416	u64 page_start = 0, page_end = 0;
417	struct page *page;
418
419	if (locked_page) {
420		page_start = page_offset(locked_page);
421		page_end = page_start + PAGE_SIZE - 1;
422	}
423
424	while (index <= end_index) {
425		/*
426		 * For the locked page, btrfs_mark_ordered_io_finished() will be
427		 * called on it in run_delalloc_range() for the error handling,
428		 * which will clear the page Ordered bit and run the ordered
429		 * extent accounting.
430		 *
431		 * Here we can't just clear the Ordered bit, or
432		 * btrfs_mark_ordered_io_finished() would skip the accounting
433		 * for the page range, and the ordered extent will never finish.
434		 */
435		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
436			index++;
437			continue;
438		}
439		page = find_get_page(inode->vfs_inode.i_mapping, index);
440		index++;
441		if (!page)
442			continue;
443
444		/*
445		 * Here we just clear all Ordered bits for every page in the
446		 * range, then btrfs_mark_ordered_io_finished() will handle
447		 * the ordered extent accounting for the range.
448		 */
449		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
450					       offset, bytes);
451		put_page(page);
452	}
453
454	if (locked_page) {
455		/* The locked page covers the full range, nothing needs to be done */
456		if (bytes + offset <= page_start + PAGE_SIZE)
457			return;
458		/*
459		 * If this page belongs to the delalloc range being instantiated,
460		 * skip it, since the first page of a range is going to be
461		 * properly cleaned up by the caller of run_delalloc_range().
463		 */
464		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
465			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
466			offset = page_offset(locked_page) + PAGE_SIZE;
467		}
468	}
469
470	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
471}
472
473static int btrfs_dirty_inode(struct btrfs_inode *inode);
474
475static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
476				     struct btrfs_new_inode_args *args)
477{
478	int err;
479
480	if (args->default_acl) {
481		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
482				      ACL_TYPE_DEFAULT);
483		if (err)
484			return err;
485	}
486	if (args->acl) {
487		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
488		if (err)
489			return err;
490	}
491	if (!args->default_acl && !args->acl)
492		cache_no_acl(args->inode);
493	return btrfs_xattr_security_init(trans, args->inode, args->dir,
494					 &args->dentry->d_name);
495}
496
497/*
498 * This does all the hard work for inserting an inline extent into
499 * the btree.  The caller should have called btrfs_drop_extents() so that
500 * no overlapping inline items exist in the btree.
501 */
502static int insert_inline_extent(struct btrfs_trans_handle *trans,
503				struct btrfs_path *path,
504				struct btrfs_inode *inode, bool extent_inserted,
505				size_t size, size_t compressed_size,
506				int compress_type,
507				struct page **compressed_pages,
508				bool update_i_size)
509{
510	struct btrfs_root *root = inode->root;
511	struct extent_buffer *leaf;
512	struct page *page = NULL;
513	char *kaddr;
514	unsigned long ptr;
515	struct btrfs_file_extent_item *ei;
516	int ret;
517	size_t cur_size = size;
518	u64 i_size;
519
520	ASSERT((compressed_size > 0 && compressed_pages) ||
521	       (compressed_size == 0 && !compressed_pages));
522
523	if (compressed_size && compressed_pages)
524		cur_size = compressed_size;
525
526	if (!extent_inserted) {
527		struct btrfs_key key;
528		size_t datasize;
529
530		key.objectid = btrfs_ino(inode);
531		key.offset = 0;
532		key.type = BTRFS_EXTENT_DATA_KEY;
533
534		datasize = btrfs_file_extent_calc_inline_size(cur_size);
535		ret = btrfs_insert_empty_item(trans, root, path, &key,
536					      datasize);
537		if (ret)
538			goto fail;
539	}
540	leaf = path->nodes[0];
541	ei = btrfs_item_ptr(leaf, path->slots[0],
542			    struct btrfs_file_extent_item);
543	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
544	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
545	btrfs_set_file_extent_encryption(leaf, ei, 0);
546	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
547	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
548	ptr = btrfs_file_extent_inline_start(ei);
549
550	if (compress_type != BTRFS_COMPRESS_NONE) {
551		struct page *cpage;
552		int i = 0;
553		while (compressed_size > 0) {
554			cpage = compressed_pages[i];
555			cur_size = min_t(unsigned long, compressed_size,
556				       PAGE_SIZE);
557
558			kaddr = kmap_local_page(cpage);
559			write_extent_buffer(leaf, kaddr, ptr, cur_size);
560			kunmap_local(kaddr);
561
562			i++;
563			ptr += cur_size;
564			compressed_size -= cur_size;
565		}
566		btrfs_set_file_extent_compression(leaf, ei,
567						  compress_type);
568	} else {
569		page = find_get_page(inode->vfs_inode.i_mapping, 0);
570		btrfs_set_file_extent_compression(leaf, ei, 0);
571		kaddr = kmap_local_page(page);
572		write_extent_buffer(leaf, kaddr, ptr, size);
573		kunmap_local(kaddr);
574		put_page(page);
575	}
576	btrfs_mark_buffer_dirty(trans, leaf);
577	btrfs_release_path(path);
578
579	/*
580	 * We align size to sectorsize for inline extents just for simplicity's
581	 * sake.
582	 */
583	ret = btrfs_inode_set_file_extent_range(inode, 0,
584					ALIGN(size, root->fs_info->sectorsize));
585	if (ret)
586		goto fail;
587
588	/*
589	 * We're an inline extent, so nobody can extend the file past i_size
590	 * without locking a page we already have locked.
591	 *
592	 * We must do any i_size and inode updates before we unlock the pages.
593	 * Otherwise we could end up racing with unlink.
594	 */
595	i_size = i_size_read(&inode->vfs_inode);
596	if (update_i_size && size > i_size) {
597		i_size_write(&inode->vfs_inode, size);
598		i_size = size;
599	}
600	inode->disk_i_size = i_size;
601
602fail:
603	return ret;
604}
605
606
607/*
608 * conditionally insert an inline extent into the file.  This
609 * does the checks required to make sure the data is small enough
610 * to fit as an inline extent.
611 */
612static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
613					  size_t compressed_size,
614					  int compress_type,
615					  struct page **compressed_pages,
616					  bool update_i_size)
617{
618	struct btrfs_drop_extents_args drop_args = { 0 };
619	struct btrfs_root *root = inode->root;
620	struct btrfs_fs_info *fs_info = root->fs_info;
621	struct btrfs_trans_handle *trans;
622	u64 data_len = (compressed_size ?: size);
623	int ret;
624	struct btrfs_path *path;
625
626	/*
627	 * We can create an inline extent if it ends at or beyond the current
628	 * i_size, is no larger than a sector (decompressed), and the (possibly
629	 * compressed) data fits in a leaf and the configured maximum inline
630	 * size.
631	 */
632	if (size < i_size_read(&inode->vfs_inode) ||
633	    size > fs_info->sectorsize ||
634	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
635	    data_len > fs_info->max_inline)
636		return 1;
637
638	path = btrfs_alloc_path();
639	if (!path)
640		return -ENOMEM;
641
642	trans = btrfs_join_transaction(root);
643	if (IS_ERR(trans)) {
644		btrfs_free_path(path);
645		return PTR_ERR(trans);
646	}
647	trans->block_rsv = &inode->block_rsv;
648
649	drop_args.path = path;
650	drop_args.start = 0;
651	drop_args.end = fs_info->sectorsize;
652	drop_args.drop_cache = true;
653	drop_args.replace_extent = true;
654	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
655	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
656	if (ret) {
657		btrfs_abort_transaction(trans, ret);
658		goto out;
659	}
660
661	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
662				   size, compressed_size, compress_type,
663				   compressed_pages, update_i_size);
664	if (ret && ret != -ENOSPC) {
665		btrfs_abort_transaction(trans, ret);
666		goto out;
667	} else if (ret == -ENOSPC) {
668		ret = 1;
669		goto out;
670	}
671
672	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
673	ret = btrfs_update_inode(trans, root, inode);
674	if (ret && ret != -ENOSPC) {
675		btrfs_abort_transaction(trans, ret);
676		goto out;
677	} else if (ret == -ENOSPC) {
678		ret = 1;
679		goto out;
680	}
681
682	btrfs_set_inode_full_sync(inode);
683out:
684	/*
685	 * Don't forget to free the reserved space: an inline extent won't
686	 * count as a data extent, so free the reservation directly here.
687	 * At reserve time it's always aligned to page size, so just free
688	 * one page here.
689	 */
690	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
691	btrfs_free_path(path);
692	btrfs_end_transaction(trans);
693	return ret;
694}
695
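/*
 * Async compression overview: run_delalloc_compressed() splits a delalloc
 * range into 512K async_chunk work items sharing one async_cow; for each
 * chunk, compress_file_range() queues async_extents on the chunk's list,
 * and submit_compressed_extents() later allocates disk space for them and
 * submits the (compressed or uncompressed) writes.
 */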
696struct async_extent {
697	u64 start;
698	u64 ram_size;
699	u64 compressed_size;
700	struct page **pages;
701	unsigned long nr_pages;
702	int compress_type;
703	struct list_head list;
704};
705
706struct async_chunk {
707	struct btrfs_inode *inode;
708	struct page *locked_page;
709	u64 start;
710	u64 end;
711	blk_opf_t write_flags;
712	struct list_head extents;
713	struct cgroup_subsys_state *blkcg_css;
714	struct btrfs_work work;
715	struct async_cow *async_cow;
716};
717
718struct async_cow {
719	atomic_t num_chunks;
720	struct async_chunk chunks[];
721};
722
723static noinline int add_async_extent(struct async_chunk *cow,
724				     u64 start, u64 ram_size,
725				     u64 compressed_size,
726				     struct page **pages,
727				     unsigned long nr_pages,
728				     int compress_type)
729{
730	struct async_extent *async_extent;
731
732	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
733	BUG_ON(!async_extent); /* -ENOMEM */
734	async_extent->start = start;
735	async_extent->ram_size = ram_size;
736	async_extent->compressed_size = compressed_size;
737	async_extent->pages = pages;
738	async_extent->nr_pages = nr_pages;
739	async_extent->compress_type = compress_type;
740	list_add_tail(&async_extent->list, &cow->extents);
741	return 0;
742}
743
744/*
745 * Check if the inode needs to be submitted to compression, based on mount
746 * options, defragmentation, properties or heuristics.
747 */
748static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
749				      u64 end)
750{
751	struct btrfs_fs_info *fs_info = inode->root->fs_info;
752
753	if (!btrfs_inode_can_compress(inode)) {
754		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
755			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
756			btrfs_ino(inode));
757		return 0;
758	}
759	/*
760	 * Special check for subpage.
761	 *
762	 * We lock the full page then run each delalloc range in the page, thus
763	 * for the following case, we will hit some subpage specific corner case:
764	 *
765	 * 0		32K		64K
766	 * |	|///////|	|///////|
767	 *		\- A		\- B
768	 *
769	 * In the above case, both range A and range B will try to unlock the
770	 * full page [0, 64K), so whichever finishes later will find the page
771	 * already unlocked, triggering various page lock requirement BUG_ON()s.
772	 *
773	 * So here we add an artificial limit: subpage compression can only
774	 * happen if the range is fully page aligned.
775	 *
776	 * In theory we only need to ensure the first page is fully covered, but
777	 * the trailing partial page would be locked until the full compression
778	 * finishes, delaying the writes of other ranges.
779	 *
780	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
781	 * to prevent any submitted async extent from unlocking the full page.
782	 * That way we can ensure, for the subpage case, that only the last
783	 * async_cow will unlock the full page.
784	 */
785	if (fs_info->sectorsize < PAGE_SIZE) {
786		if (!PAGE_ALIGNED(start) ||
787		    !PAGE_ALIGNED(end + 1))
788			return 0;
789	}
790
791	/* force compress */
792	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
793		return 1;
794	/* defrag ioctl */
795	if (inode->defrag_compress)
796		return 1;
797	/* bad compression ratios */
798	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
799		return 0;
800	if (btrfs_test_opt(fs_info, COMPRESS) ||
801	    inode->flags & BTRFS_INODE_COMPRESS ||
802	    inode->prop_compress)
803		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
804	return 0;
805}
806
807static inline void inode_should_defrag(struct btrfs_inode *inode,
808		u64 start, u64 end, u64 num_bytes, u32 small_write)
809{
810	/* If this is a small write inside eof, kick off a defrag */
811	if (num_bytes < small_write &&
812	    (start > 0 || end + 1 < inode->disk_i_size))
813		btrfs_add_inode_defrag(NULL, inode, small_write);
814}
815
816/*
817 * Work queue callback to start compression on a file and pages.
818 *
819 * This is done inside an ordered work queue, and the compression is spread
820 * across many cpus.  The actual IO submission is step two, and the ordered work
821 * queue takes care of making sure that happens in the same order things were
822 * put onto the queue by writepages and friends.
823 *
824 * If this code finds it can't get good compression, it puts an entry onto the
825 * work queue to write the uncompressed bytes.  This makes sure that both
826 * compressed inodes and uncompressed inodes are written in the same order that
827 * the flusher thread sent them down.
828 * the flusher thread sent them down.
 */
829static void compress_file_range(struct btrfs_work *work)
830{
831	struct async_chunk *async_chunk =
832		container_of(work, struct async_chunk, work);
833	struct btrfs_inode *inode = async_chunk->inode;
834	struct btrfs_fs_info *fs_info = inode->root->fs_info;
835	struct address_space *mapping = inode->vfs_inode.i_mapping;
836	u64 blocksize = fs_info->sectorsize;
837	u64 start = async_chunk->start;
838	u64 end = async_chunk->end;
839	u64 actual_end;
840	u64 i_size;
841	int ret = 0;
842	struct page **pages;
843	unsigned long nr_pages;
844	unsigned long total_compressed = 0;
845	unsigned long total_in = 0;
846	unsigned int poff;
847	int i;
848	int compress_type = fs_info->compress_type;
849
850	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
851
852	/*
853	 * We need to call clear_page_dirty_for_io on each page in the range.
854	 * Otherwise applications with the file mmap'd can wander in and change
855	 * the page contents while we are compressing them.
856	 */
857	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
858
859	/*
860	 * We need to save i_size before now because it could change in between
861	 * us evaluating the size and assigning it.  This is because we lock and
862	 * unlock the page in truncate and fallocate, and then modify the i_size
863	 * later on.
864	 *
865	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
866	 * does that for us.
867	 */
868	barrier();
869	i_size = i_size_read(&inode->vfs_inode);
870	barrier();
871	actual_end = min_t(u64, i_size, end + 1);
872again:
873	pages = NULL;
874	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
875	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
876
877	/*
878	 * we don't want to send crud past the end of i_size through
879	 * compression, that's just a waste of CPU time.  So, if the
880	 * end of the file is before the start of our current
881	 * requested range of bytes, we bail out to the uncompressed
882	 * cleanup code that can deal with all of this.
883	 *
884	 * It isn't really the fastest way to fix things, but this is a
885	 * very uncommon corner.
886	 */
887	if (actual_end <= start)
888		goto cleanup_and_bail_uncompressed;
889
890	total_compressed = actual_end - start;
891
892	/*
893	 * Skip compression for a small file range(<=blocksize) that
894	 * isn't an inline extent, since it doesn't save disk space at all.
895	 */
896	if (total_compressed <= blocksize &&
897	   (start > 0 || end + 1 < inode->disk_i_size))
898		goto cleanup_and_bail_uncompressed;
899
900	/*
901	 * For subpage case, we require full page alignment for the sector
902	 * aligned range.
903	 * Thus we must also check against @actual_end, not just @end.
904	 */
905	if (blocksize < PAGE_SIZE) {
906		if (!PAGE_ALIGNED(start) ||
907		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
908			goto cleanup_and_bail_uncompressed;
909	}
910
911	total_compressed = min_t(unsigned long, total_compressed,
912			BTRFS_MAX_UNCOMPRESSED);
913	total_in = 0;
914	ret = 0;
915
916	/*
917	 * We do compression for mount -o compress and when the inode has not
918	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
919	 * discover bad compression ratios.
920	 */
921	if (!inode_need_compress(inode, start, end))
922		goto cleanup_and_bail_uncompressed;
923
924	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
925	if (!pages) {
926		/*
927		 * Memory allocation failure is not a fatal error, we can fall
928		 * back to uncompressed code.
929		 */
930		goto cleanup_and_bail_uncompressed;
931	}
932
933	if (inode->defrag_compress)
934		compress_type = inode->defrag_compress;
935	else if (inode->prop_compress)
936		compress_type = inode->prop_compress;
937
938	/* Compression level is applied here. */
939	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
940				   mapping, start, pages, &nr_pages, &total_in,
941				   &total_compressed);
942	if (ret)
943		goto mark_incompressible;
944
945	/*
946	 * Zero the tail end of the last page, as we might be sending it down
947	 * to disk.
948	 */
949	poff = offset_in_page(total_compressed);
950	if (poff)
951		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
952
953	/*
954	 * Try to create an inline extent.
955	 *
956	 * If we didn't compress the entire range, try to create an uncompressed
957	 * inline extent, else a compressed one.
958	 *
959	 * Check cow_file_range() for why we don't even try to create inline
960	 * extent for the subpage case.
961	 */
962	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
963		if (total_in < actual_end) {
964			ret = cow_file_range_inline(inode, actual_end, 0,
965						    BTRFS_COMPRESS_NONE, NULL,
966						    false);
967		} else {
968			ret = cow_file_range_inline(inode, actual_end,
969						    total_compressed,
970						    compress_type, pages,
971						    false);
972		}
973		if (ret <= 0) {
974			unsigned long clear_flags = EXTENT_DELALLOC |
975				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
976				EXTENT_DO_ACCOUNTING;
977
978			if (ret < 0)
979				mapping_set_error(mapping, -EIO);
980
981			/*
982			 * inline extent creation worked or returned error,
983			 * we don't need to create any more async work items.
984			 * Unlock and free up our temp pages.
985			 *
986			 * We use DO_ACCOUNTING here because we need the
987			 * delalloc_release_metadata to be done _after_ we drop
988			 * our outstanding extent for clearing delalloc for this
989			 * range.
990			 */
991			extent_clear_unlock_delalloc(inode, start, end,
992						     NULL,
993						     clear_flags,
994						     PAGE_UNLOCK |
995						     PAGE_START_WRITEBACK |
996						     PAGE_END_WRITEBACK);
997			goto free_pages;
998		}
999	}
1000
1001	/*
1002	 * We aren't doing an inline extent. Round the compressed size up to a
1003	 * block size boundary so the allocator does sane things.
1004	 */
1005	total_compressed = ALIGN(total_compressed, blocksize);
1006
1007	/*
1008	 * One last check to make sure the compression is really a win, compare
1009	 * the page count read with the blocks on disk, compression must free at
1010	 * least one sector.
1011	 */
1012	total_in = round_up(total_in, fs_info->sectorsize);
1013	if (total_compressed + blocksize > total_in)
1014		goto mark_incompressible;
1015
1016	/*
1017	 * The async work queues will take care of doing actual allocation on
1018	 * disk for these compressed pages, and will submit the bios.
1019	 */
1020	add_async_extent(async_chunk, start, total_in, total_compressed, pages,
1021			 nr_pages, compress_type);
1022	if (start + total_in < end) {
1023		start += total_in;
1024		cond_resched();
1025		goto again;
1026	}
1027	return;
1028
1029mark_incompressible:
1030	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1031		inode->flags |= BTRFS_INODE_NOCOMPRESS;
1032cleanup_and_bail_uncompressed:
1033	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1034			 BTRFS_COMPRESS_NONE);
1035free_pages:
1036	if (pages) {
1037		for (i = 0; i < nr_pages; i++) {
1038			WARN_ON(pages[i]->mapping);
1039			put_page(pages[i]);
1040		}
1041		kfree(pages);
1042	}
1043}
1044
1045static void free_async_extent_pages(struct async_extent *async_extent)
1046{
1047	int i;
1048
1049	if (!async_extent->pages)
1050		return;
1051
1052	for (i = 0; i < async_extent->nr_pages; i++) {
1053		WARN_ON(async_extent->pages[i]->mapping);
1054		put_page(async_extent->pages[i]);
1055	}
1056	kfree(async_extent->pages);
1057	async_extent->nr_pages = 0;
1058	async_extent->pages = NULL;
1059}
1060
1061static void submit_uncompressed_range(struct btrfs_inode *inode,
1062				      struct async_extent *async_extent,
1063				      struct page *locked_page)
1064{
1065	u64 start = async_extent->start;
1066	u64 end = async_extent->start + async_extent->ram_size - 1;
1067	int ret;
1068	struct writeback_control wbc = {
1069		.sync_mode		= WB_SYNC_ALL,
1070		.range_start		= start,
1071		.range_end		= end,
1072		.no_cgroup_owner	= 1,
1073	};
1074
1075	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1076	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
1077	wbc_detach_inode(&wbc);
1078	if (ret < 0) {
1079		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
1080		if (locked_page) {
1081			const u64 page_start = page_offset(locked_page);
1082
1083			set_page_writeback(locked_page);
1084			end_page_writeback(locked_page);
1085			btrfs_mark_ordered_io_finished(inode, locked_page,
1086						       page_start, PAGE_SIZE,
1087						       !ret);
1088			mapping_set_error(locked_page->mapping, ret);
1089			unlock_page(locked_page);
1090		}
1091	}
1092}
1093
1094static void submit_one_async_extent(struct async_chunk *async_chunk,
1095				    struct async_extent *async_extent,
1096				    u64 *alloc_hint)
1097{
1098	struct btrfs_inode *inode = async_chunk->inode;
1099	struct extent_io_tree *io_tree = &inode->io_tree;
1100	struct btrfs_root *root = inode->root;
1101	struct btrfs_fs_info *fs_info = root->fs_info;
1102	struct btrfs_ordered_extent *ordered;
1103	struct btrfs_key ins;
1104	struct page *locked_page = NULL;
1105	struct extent_map *em;
1106	int ret = 0;
1107	u64 start = async_extent->start;
1108	u64 end = async_extent->start + async_extent->ram_size - 1;
1109
1110	if (async_chunk->blkcg_css)
1111		kthread_associate_blkcg(async_chunk->blkcg_css);
1112
1113	/*
1114	 * If async_chunk->locked_page is in the async_extent range, we need to
1115	 * handle it.
1116	 */
1117	if (async_chunk->locked_page) {
1118		u64 locked_page_start = page_offset(async_chunk->locked_page);
1119		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
1120
1121		if (!(start >= locked_page_end || end <= locked_page_start))
1122			locked_page = async_chunk->locked_page;
1123	}
1124	lock_extent(io_tree, start, end, NULL);
1125
1126	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1127		submit_uncompressed_range(inode, async_extent, locked_page);
1128		goto done;
1129	}
1130
1131	ret = btrfs_reserve_extent(root, async_extent->ram_size,
1132				   async_extent->compressed_size,
1133				   async_extent->compressed_size,
1134				   0, *alloc_hint, &ins, 1, 1);
1135	if (ret) {
1136		/*
1137		 * Here we used to try again by going back to the non-compressed
1138		 * path for ENOSPC.  But if we can't reserve space even for the
1139		 * compressed size, how could it work for the uncompressed size,
1140		 * which requires more space?  So here we directly take the error
1141		 * path.
1142		 */
1143		goto out_free;
1144	}
1145
1146	/* Here we're doing allocation and writeback of the compressed pages */
1147	em = create_io_em(inode, start,
1148			  async_extent->ram_size,	/* len */
1149			  start,			/* orig_start */
1150			  ins.objectid,			/* block_start */
1151			  ins.offset,			/* block_len */
1152			  ins.offset,			/* orig_block_len */
1153			  async_extent->ram_size,	/* ram_bytes */
1154			  async_extent->compress_type,
1155			  BTRFS_ORDERED_COMPRESSED);
1156	if (IS_ERR(em)) {
1157		ret = PTR_ERR(em);
1158		goto out_free_reserve;
1159	}
1160	free_extent_map(em);
1161
1162	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
1163				       async_extent->ram_size,	/* num_bytes */
1164				       async_extent->ram_size,	/* ram_bytes */
1165				       ins.objectid,		/* disk_bytenr */
1166				       ins.offset,		/* disk_num_bytes */
1167				       0,			/* offset */
1168				       1 << BTRFS_ORDERED_COMPRESSED,
1169				       async_extent->compress_type);
1170	if (IS_ERR(ordered)) {
1171		btrfs_drop_extent_map_range(inode, start, end, false);
1172		ret = PTR_ERR(ordered);
1173		goto out_free_reserve;
1174	}
1175	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1176
1177	/* Clear dirty, set writeback and unlock the pages. */
1178	extent_clear_unlock_delalloc(inode, start, end,
1179			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
1180			PAGE_UNLOCK | PAGE_START_WRITEBACK);
1181	btrfs_submit_compressed_write(ordered,
1182			    async_extent->pages,	/* compressed_pages */
1183			    async_extent->nr_pages,
1184			    async_chunk->write_flags, true);
1185	*alloc_hint = ins.objectid + ins.offset;
1186done:
1187	if (async_chunk->blkcg_css)
1188		kthread_associate_blkcg(NULL);
1189	kfree(async_extent);
1190	return;
1191
1192out_free_reserve:
1193	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1194	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1195out_free:
1196	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1197	extent_clear_unlock_delalloc(inode, start, end,
1198				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1199				     EXTENT_DELALLOC_NEW |
1200				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1201				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1202				     PAGE_END_WRITEBACK);
1203	free_async_extent_pages(async_extent);
1204	if (async_chunk->blkcg_css)
1205		kthread_associate_blkcg(NULL);
1206	btrfs_debug(fs_info,
1207"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1208		    root->root_key.objectid, btrfs_ino(inode), start,
1209		    async_extent->ram_size, ret);
1210	kfree(async_extent);
1211}
1212
1213static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1214				      u64 num_bytes)
1215{
1216	struct extent_map_tree *em_tree = &inode->extent_tree;
1217	struct extent_map *em;
1218	u64 alloc_hint = 0;
1219
1220	read_lock(&em_tree->lock);
1221	em = search_extent_mapping(em_tree, start, num_bytes);
1222	if (em) {
1223		/*
1224		 * if block start isn't an actual block number then find the
1225		 * first block in this inode and use that as a hint.  If that
1226		 * block is also bogus then just don't worry about it.
1227		 */
1228		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1229			free_extent_map(em);
1230			em = search_extent_mapping(em_tree, 0, 0);
1231			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1232				alloc_hint = em->block_start;
1233			if (em)
1234				free_extent_map(em);
1235		} else {
1236			alloc_hint = em->block_start;
1237			free_extent_map(em);
1238		}
1239	}
1240	read_unlock(&em_tree->lock);
1241
1242	return alloc_hint;
1243}
1244
1245/*
1246 * When extent_io.c finds a delayed allocation range in the file,
1247 * the callbacks end up in this code.  The basic idea is to
1248 * allocate extents on disk for the range, and create ordered data structs
1249 * in ram to track those extents.
1250 *
1251 * locked_page is the page that writepage had locked already.  We use
1252 * it to make sure we don't do extra locks or unlocks.
1253 *
1254 * When this function fails, it unlocks all pages except @locked_page.
1255 *
1256 * When this function successfully creates an inline extent, it returns 1 and
1257 * unlocks all pages including locked_page and starts I/O on them.
1258 * (In reality inline extents are limited to a single page, so locked_page is
1259 * the only page handled anyway).
1260 *
1261 * When this function succeeds and creates a normal extent, the page locking
1262 * status depends on the passed in flags:
1263 *
1264 * - If @keep_locked is set, all pages are kept locked.
1265 * - Else all pages except for @locked_page are unlocked.
1266 *
1267 * When a failure happens in the second or later iteration of the
1268 * while-loop, the ordered extents created in previous iterations are kept
1269 * intact. So, the caller must clean them up by calling
1270 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1271 * example.
1272 */
1273static noinline int cow_file_range(struct btrfs_inode *inode,
1274				   struct page *locked_page, u64 start, u64 end,
1275				   u64 *done_offset,
1276				   bool keep_locked, bool no_inline)
1277{
1278	struct btrfs_root *root = inode->root;
1279	struct btrfs_fs_info *fs_info = root->fs_info;
1280	u64 alloc_hint = 0;
1281	u64 orig_start = start;
1282	u64 num_bytes;
1283	unsigned long ram_size;
1284	u64 cur_alloc_size = 0;
1285	u64 min_alloc_size;
1286	u64 blocksize = fs_info->sectorsize;
1287	struct btrfs_key ins;
1288	struct extent_map *em;
1289	unsigned clear_bits;
1290	unsigned long page_ops;
1291	bool extent_reserved = false;
1292	int ret = 0;
1293
1294	if (btrfs_is_free_space_inode(inode)) {
1295		ret = -EINVAL;
1296		goto out_unlock;
1297	}
1298
1299	num_bytes = ALIGN(end - start + 1, blocksize);
1300	num_bytes = max(blocksize,  num_bytes);
1301	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1302
1303	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1304
1305	/*
1306	 * Due to the page size limit, for subpage we can only trigger the
1307	 * writeback for the dirty sectors of a page, which means data writeback
1308	 * is doing more writeback than what we want.
1309	 *
1310	 * This is especially unexpected for some call sites like fallocate,
1311	 * where we only increase i_size after everything is done.
1312	 * This means we can trigger inline extent even if we didn't want to.
1313	 * So here we skip inline extent creation completely.
1314	 */
1315	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
1316		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1317				       end + 1);
1318
1319		/* let's try to make an inline extent */
1320		ret = cow_file_range_inline(inode, actual_end, 0,
1321					    BTRFS_COMPRESS_NONE, NULL, false);
1322		if (ret == 0) {
1323			/*
1324			 * We use DO_ACCOUNTING here because we need the
1325			 * delalloc_release_metadata to be run _after_ we drop
1326			 * our outstanding extent for clearing delalloc for this
1327			 * range.
1328			 */
1329			extent_clear_unlock_delalloc(inode, start, end,
1330				     locked_page,
1331				     EXTENT_LOCKED | EXTENT_DELALLOC |
1332				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1333				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1334				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1335			/*
1336			 * locked_page is locked by the caller of
1337			 * writepage_delalloc(), not locked by
1338			 * __process_pages_contig().
1339			 *
1340			 * We can't let __process_pages_contig() unlock it,
1341			 * as it doesn't have any subpage::writers recorded.
1342			 *
1343			 * Here we manually unlock the page, since the caller
1344			 * can't determine if it's an inline extent or a
1345			 * compressed extent.
1346			 */
1347			unlock_page(locked_page);
1348			ret = 1;
1349			goto done;
1350		} else if (ret < 0) {
1351			goto out_unlock;
1352		}
1353	}
1354
1355	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1356
1357	/*
1358	 * Relocation relies on the relocated extents to have exactly the same
1359	 * size as the original extents. Normally writeback for relocation data
1360	 * extents follows a NOCOW path because relocation preallocates the
1361	 * extents. However, due to an operation such as scrub turning a block
1362	 * group to RO mode, it may fallback to COW mode, so we must make sure
1363	 * an extent allocated during COW has exactly the requested size and can
1364	 * not be split into smaller extents, otherwise relocation breaks and
1365	 * fails during the stage where it updates the bytenr of file extent
1366	 * items.
1367	 */
1368	if (btrfs_is_data_reloc_root(root))
1369		min_alloc_size = num_bytes;
1370	else
1371		min_alloc_size = fs_info->sectorsize;
1372
1373	while (num_bytes > 0) {
1374		struct btrfs_ordered_extent *ordered;
1375
1376		cur_alloc_size = num_bytes;
1377		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1378					   min_alloc_size, 0, alloc_hint,
1379					   &ins, 1, 1);
1380		if (ret == -EAGAIN) {
1381			/*
1382			 * btrfs_reserve_extent only returns -EAGAIN for zoned
1383			 * file systems, which is an indication that there are
1384			 * no active zones to allocate from at the moment.
1385			 *
1386			 * If this is the first loop iteration, wait for at
1387			 * least one zone to finish before retrying the
1388			 * allocation.  Otherwise ask the caller to write out
1389			 * the already allocated blocks before coming back to
1390			 * us, or return -ENOSPC if it can't handle retries.
1391			 */
1392			ASSERT(btrfs_is_zoned(fs_info));
1393			if (start == orig_start) {
1394				wait_on_bit_io(&inode->root->fs_info->flags,
1395					       BTRFS_FS_NEED_ZONE_FINISH,
1396					       TASK_UNINTERRUPTIBLE);
1397				continue;
1398			}
1399			if (done_offset) {
1400				*done_offset = start - 1;
1401				return 0;
1402			}
1403			ret = -ENOSPC;
1404		}
1405		if (ret < 0)
1406			goto out_unlock;
1407		cur_alloc_size = ins.offset;
1408		extent_reserved = true;
1409
1410		ram_size = ins.offset;
1411		em = create_io_em(inode, start, ins.offset, /* len */
1412				  start, /* orig_start */
1413				  ins.objectid, /* block_start */
1414				  ins.offset, /* block_len */
1415				  ins.offset, /* orig_block_len */
1416				  ram_size, /* ram_bytes */
1417				  BTRFS_COMPRESS_NONE, /* compress_type */
1418				  BTRFS_ORDERED_REGULAR /* type */);
1419		if (IS_ERR(em)) {
1420			ret = PTR_ERR(em);
1421			goto out_reserve;
1422		}
1423		free_extent_map(em);
1424
1425		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
1426					ram_size, ins.objectid, cur_alloc_size,
1427					0, 1 << BTRFS_ORDERED_REGULAR,
1428					BTRFS_COMPRESS_NONE);
1429		if (IS_ERR(ordered)) {
1430			ret = PTR_ERR(ordered);
1431			goto out_drop_extent_cache;
1432		}
1433
1434		if (btrfs_is_data_reloc_root(root)) {
1435			ret = btrfs_reloc_clone_csums(ordered);
1436
1437			/*
1438			 * Only drop cache here, and process as normal.
1439			 *
1440			 * We must not allow extent_clear_unlock_delalloc()
1441			 * at the out_unlock label to free the metadata of this
1442			 * ordered extent, as its metadata should be freed by
1443			 * btrfs_finish_ordered_io().
1444			 *
1445			 * So we must continue until @start is increased to
1446			 * skip the current ordered extent.
1447			 */
1448			if (ret)
1449				btrfs_drop_extent_map_range(inode, start,
1450							    start + ram_size - 1,
1451							    false);
1452		}
1453		btrfs_put_ordered_extent(ordered);
1454
1455		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1456
1457		/*
1458		 * We're not doing compressed IO, don't unlock the first page
1459		 * (which the caller expects to stay locked), don't clear any
1460		 * dirty bits and don't set any writeback bits
1461		 *
1462		 * Do set the Ordered (Private2) bit so we know this page was
1463		 * properly set up for writepage.
1464		 */
1465		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1466		page_ops |= PAGE_SET_ORDERED;
1467
1468		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1469					     locked_page,
1470					     EXTENT_LOCKED | EXTENT_DELALLOC,
1471					     page_ops);
1472		if (num_bytes < cur_alloc_size)
1473			num_bytes = 0;
1474		else
1475			num_bytes -= cur_alloc_size;
1476		alloc_hint = ins.objectid + ins.offset;
1477		start += cur_alloc_size;
1478		extent_reserved = false;
1479
1480		/*
1481		 * On btrfs_reloc_clone_csums() error: since @start has been
1482		 * increased, extent_clear_unlock_delalloc() at the out_unlock
1483		 * label won't free the metadata of the current ordered extent,
1484		 * so we're OK to exit.
1484		 */
1485		if (ret)
1486			goto out_unlock;
1487	}
1488done:
1489	if (done_offset)
1490		*done_offset = end;
1491	return ret;
1492
1493out_drop_extent_cache:
1494	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1495out_reserve:
1496	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1497	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1498out_unlock:
1499	/*
1500	 * Now, we have three regions to clean up:
1501	 *
1502	 * |-------(1)----|---(2)---|-------------(3)----------|
1503	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1504	 *
1505	 * We process each region below.
1506	 */
1507
1508	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1509		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1510	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1511
1512	/*
1513	 * For the range (1). We have already instantiated the ordered extents
1514	 * for this region. They are cleaned up by
1515	 * btrfs_cleanup_ordered_extents() in e.g.
1516	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1517	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1518	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1519	 * function.
1520	 *
1521	 * However, in case of @keep_locked, we still need to unlock the pages
1522	 * (except @locked_page) to ensure all the pages are unlocked.
1523	 */
1524	if (keep_locked && orig_start < start) {
1525		if (!locked_page)
1526			mapping_set_error(inode->vfs_inode.i_mapping, ret);
1527		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1528					     locked_page, 0, page_ops);
1529	}
1530
1531	/*
1532	 * For the range (2). If we reserved an extent for our delalloc range
1533	 * (or a subrange) and failed to create the respective ordered extent,
1534	 * then it means that when we reserved the extent we decremented the
1535	 * extent's size from the data space_info's bytes_may_use counter and
1536	 * incremented the space_info's bytes_reserved counter by the same
1537	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1538	 * to decrement again the data space_info's bytes_may_use counter,
1539	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1540	 */
1541	if (extent_reserved) {
1542		extent_clear_unlock_delalloc(inode, start,
1543					     start + cur_alloc_size - 1,
1544					     locked_page,
1545					     clear_bits,
1546					     page_ops);
1547		start += cur_alloc_size;
1548	}
1549
1550	/*
1551	 * For the range (3). We never touched the region. In addition to the
1552	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1553	 * space_info's bytes_may_use counter, reserved in
1554	 * btrfs_check_data_free_space().
1555	 */
1556	if (start < end) {
1557		clear_bits |= EXTENT_CLEAR_DATA_RESV;
1558		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1559					     clear_bits, page_ops);
1560	}
1561	return ret;
1562}
1563
1564/*
1565 * Phase two of compressed writeback.  This is the ordered portion of the code,
1566 * which only gets called in the order the work was queued.  We walk all the
1567 * async extents created by compress_file_range and send them down to the disk.
1568 */
1569static noinline void submit_compressed_extents(struct btrfs_work *work)
1570{
1571	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1572						     work);
1573	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1574	struct async_extent *async_extent;
1575	unsigned long nr_pages;
1576	u64 alloc_hint = 0;
1577
1578	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1579		PAGE_SHIFT;
1580
1581	while (!list_empty(&async_chunk->extents)) {
1582		async_extent = list_entry(async_chunk->extents.next,
1583					  struct async_extent, list);
1584		list_del(&async_extent->list);
1585		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1586	}
1587
1588	/* atomic_sub_return implies a barrier */
1589	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1590	    5 * SZ_1M)
1591		cond_wake_up_nomb(&fs_info->async_submit_wait);
1592}
1593
1594static noinline void async_cow_free(struct btrfs_work *work)
1595{
1596	struct async_chunk *async_chunk;
1597	struct async_cow *async_cow;
1598
1599	async_chunk = container_of(work, struct async_chunk, work);
1600	btrfs_add_delayed_iput(async_chunk->inode);
1601	if (async_chunk->blkcg_css)
1602		css_put(async_chunk->blkcg_css);
1603
1604	async_cow = async_chunk->async_cow;
1605	if (atomic_dec_and_test(&async_cow->num_chunks))
1606		kvfree(async_cow);
1607}
1608
1609static bool run_delalloc_compressed(struct btrfs_inode *inode,
1610				    struct page *locked_page, u64 start,
1611				    u64 end, struct writeback_control *wbc)
1612{
1613	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1614	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1615	struct async_cow *ctx;
1616	struct async_chunk *async_chunk;
1617	unsigned long nr_pages;
1618	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1619	int i;
1620	unsigned nofs_flag;
1621	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1622
1623	nofs_flag = memalloc_nofs_save();
1624	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1625	memalloc_nofs_restore(nofs_flag);
1626	if (!ctx)
1627		return false;
1628
1629	unlock_extent(&inode->io_tree, start, end, NULL);
1630	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1631
1632	async_chunk = ctx->chunks;
1633	atomic_set(&ctx->num_chunks, num_chunks);
1634
1635	for (i = 0; i < num_chunks; i++) {
1636		u64 cur_end = min(end, start + SZ_512K - 1);
1637
1638		/*
1639		 * igrab is called higher up in the call chain, take only the
1640		 * lightweight reference for the callback lifetime
1641		 */
1642		ihold(&inode->vfs_inode);
1643		async_chunk[i].async_cow = ctx;
1644		async_chunk[i].inode = inode;
1645		async_chunk[i].start = start;
1646		async_chunk[i].end = cur_end;
1647		async_chunk[i].write_flags = write_flags;
1648		INIT_LIST_HEAD(&async_chunk[i].extents);
1649
1650		/*
1651		 * The locked_page comes all the way from writepage and it's
1652		 * the original page we were actually given.  As we spread
1653		 * this large delalloc region across multiple async_chunk
1654		 * structs, only the first struct needs a pointer to locked_page
1655		 *
1656		 * This way we don't need racy decisions about who is supposed
1657		 * to unlock it.
1658		 */
1659		if (locked_page) {
1660			/*
1661			 * Depending on the compressibility, the pages might or
1662			 * might not go through async.  We want all of them to
1663			 * be accounted against wbc once.  Let's do it here
1664			 * before the paths diverge.  wbc accounting is used
1665			 * only for foreign writeback detection and doesn't
1666			 * need full accuracy.  Just account the whole thing
1667			 * against the first page.
1668			 */
1669			wbc_account_cgroup_owner(wbc, locked_page,
1670						 cur_end - start);
1671			async_chunk[i].locked_page = locked_page;
1672			locked_page = NULL;
1673		} else {
1674			async_chunk[i].locked_page = NULL;
1675		}
1676
1677		if (blkcg_css != blkcg_root_css) {
1678			css_get(blkcg_css);
1679			async_chunk[i].blkcg_css = blkcg_css;
1680			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1681		} else {
1682			async_chunk[i].blkcg_css = NULL;
1683		}
1684
1685		btrfs_init_work(&async_chunk[i].work, compress_file_range,
1686				submit_compressed_extents, async_cow_free);
1687
1688		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1689		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1690
1691		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1692
1693		start = cur_end + 1;
1694	}
1695	return true;
1696}
1697
1698/*
1699 * Run the delalloc range from start to end, and write back any dirty pages
1700 * covered by the range.
1701 */
1702static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1703				     struct page *locked_page, u64 start,
1704				     u64 end, struct writeback_control *wbc,
1705				     bool pages_dirty)
1706{
1707	u64 done_offset = end;
1708	int ret;
1709
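	/*
	 * cow_file_range() may only process part of the range, reporting how
	 * far it got via done_offset, so keep looping until the whole range
	 * has been allocated and written back.
	 */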
1710	while (start <= end) {
1711		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
1712				     true, false);
1713		if (ret)
1714			return ret;
1715		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
1716					  done_offset, wbc, pages_dirty);
1717		start = done_offset + 1;
1718	}
1719
1720	return 1;
1721}
1722
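/*
 * Check whether any checksums exist for the given byte range.
 *
 * Returns 1 if at least one checksum was found, 0 if none exist, or a
 * negative errno on failure.  Any sums looked up are freed right away, as
 * only their existence matters here.
 */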
1723static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1724					u64 bytenr, u64 num_bytes, bool nowait)
1725{
1726	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1727	struct btrfs_ordered_sum *sums;
1728	int ret;
1729	LIST_HEAD(list);
1730
1731	ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
1732				      &list, 0, nowait);
1733	if (ret == 0 && list_empty(&list))
1734		return 0;
1735
1736	while (!list_empty(&list)) {
1737		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1738		list_del(&sums->list);
1739		kfree(sums);
1740	}
1741	if (ret < 0)
1742		return ret;
1743	return 1;
1744}
1745
1746static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1747			   const u64 start, const u64 end)
1748{
1749	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1750	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1751	const u64 range_bytes = end + 1 - start;
1752	struct extent_io_tree *io_tree = &inode->io_tree;
1753	u64 range_start = start;
1754	u64 count;
1755	int ret;
1756
1757	/*
1758	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1759	 * made we did not have enough available data space and therefore did not
1760	 * reserve data space for it, since we thought we could do NOCOW for the
1761	 * respective file range (either there is a prealloc extent or the inode
1762	 * has the NOCOW bit set).
1763	 *
1764	 * However when we need to fall back to COW mode (because for example the
1765	 * block group for the corresponding extent was turned to RO mode by a
1766	 * scrub or relocation) we need to do the following:
1767	 *
1768	 * 1) We increment the bytes_may_use counter of the data space info.
1769	 *    If COW succeeds, it allocates a new data extent and after doing
1770	 *    that it decrements the space info's bytes_may_use counter and
1771	 *    increments its bytes_reserved counter by the same amount (we do
1772	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1773	 *    bytes_may_use counter to compensate (when space is reserved at
1774	 *    buffered write time, the bytes_may_use counter is incremented);
1775	 *
1776	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1777	 *    that if the COW path fails for any reason, it decrements (through
1778	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1779	 *    data space info, which we incremented in the step above.
1780	 *
1781	 * If we need to fall back to COW and the inode corresponds to a free
1782	 * space cache inode or an inode of the data relocation tree, we must
1783	 * also increment bytes_may_use of the data space_info for the same
1784	 * reason. Space caches and relocated data extents always get a prealloc
1785	 * extent for them, however scrub or balance may have set the block
1786	 * group that contains that extent to RO mode and therefore force COW
1787	 * when starting writeback.
1788	 */
1789	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1790				 EXTENT_NORESERVE, 0, NULL);
1791	if (count > 0 || is_space_ino || is_reloc_ino) {
1792		u64 bytes = count;
1793		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1794		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1795
1796		if (is_space_ino || is_reloc_ino)
1797			bytes = range_bytes;
1798
1799		spin_lock(&sinfo->lock);
1800		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1801		spin_unlock(&sinfo->lock);
1802
1803		if (count > 0)
1804			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1805					 NULL);
1806	}
1807
1808	/*
1809	 * Don't try to create inline extents, as a mix of inline extent that
1810	 * is written out and unlocked directly and a normal NOCOW extent
1811	 * doesn't work.
1812	 */
1813	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
1814	ASSERT(ret != 1);
1815	return ret;
1816}
1817
1818struct can_nocow_file_extent_args {
1819	/* Input fields. */
1820
1821	/* Start file offset of the range we want to NOCOW. */
1822	u64 start;
1823	/* End file offset (inclusive) of the range we want to NOCOW. */
1824	u64 end;
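	/*
	 * Set when called from the writeback path; a pending snapshot on the
	 * root then forces COW (see the snapshot_force_cow check below).
	 */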
1825	bool writeback_path;
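	/*
	 * When set, skip the "extent older than last snapshot" shortcut and
	 * always do the full cross-reference check for extent sharing.
	 */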
1826	bool strict;
1827	/*
1828	 * Free the path passed to can_nocow_file_extent() once it's not needed
1829	 * anymore.
1830	 */
1831	bool free_path;
1832
1833	/* Output fields. Only set when can_nocow_file_extent() returns 1. */
1834
1835	u64 disk_bytenr;
1836	u64 disk_num_bytes;
1837	u64 extent_offset;
1838	/* Number of bytes that can be written to in NOCOW mode. */
1839	u64 num_bytes;
1840};
1841
1842/*
1843 * Check if we can NOCOW the file extent that the path points to.
1844 * This function may return with the path released, so the caller should check
1845 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1846 *
1847 * Returns: < 0 on error
1848 *            0 if we can not NOCOW
1849 *            1 if we can NOCOW
1850 */
1851static int can_nocow_file_extent(struct btrfs_path *path,
1852				 struct btrfs_key *key,
1853				 struct btrfs_inode *inode,
1854				 struct can_nocow_file_extent_args *args)
1855{
1856	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1857	struct extent_buffer *leaf = path->nodes[0];
1858	struct btrfs_root *root = inode->root;
1859	struct btrfs_file_extent_item *fi;
1860	u64 extent_end;
1861	u8 extent_type;
1862	int can_nocow = 0;
1863	int ret = 0;
1864	bool nowait = path->nowait;
1865
1866	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1867	extent_type = btrfs_file_extent_type(leaf, fi);
1868
1869	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1870		goto out;
1871
1872	/* Can't access these fields unless we know it's not an inline extent. */
1873	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1874	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1875	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
1876
1877	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1878	    extent_type == BTRFS_FILE_EXTENT_REG)
1879		goto out;
1880
1881	/*
1882	 * If the extent was created before the generation where the last snapshot
1883	 * for its subvolume was created, then this implies the extent is shared,
1884	 * hence we must COW.
1885	 */
1886	if (!args->strict &&
1887	    btrfs_file_extent_generation(leaf, fi) <=
1888	    btrfs_root_last_snapshot(&root->root_item))
1889		goto out;
1890
1891	/* An explicit hole, must COW. */
1892	if (args->disk_bytenr == 0)
1893		goto out;
1894
1895	/* Compressed/encrypted/encoded extents must be COWed. */
1896	if (btrfs_file_extent_compression(leaf, fi) ||
1897	    btrfs_file_extent_encryption(leaf, fi) ||
1898	    btrfs_file_extent_other_encoding(leaf, fi))
1899		goto out;
1900
1901	extent_end = btrfs_file_extent_end(path);
1902
1903	/*
1904	 * The following checks can be expensive, as they need to take other
1905	 * locks and do btree or rbtree searches, so release the path to avoid
1906	 * blocking other tasks for too long.
1907	 */
1908	btrfs_release_path(path);
1909
1910	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1911				    key->offset - args->extent_offset,
1912				    args->disk_bytenr, args->strict, path);
1913	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1914	if (ret != 0)
1915		goto out;
1916
1917	if (args->free_path) {
1918		/*
1919		 * We don't need the path anymore, plus through the
1920		 * csum_exist_in_range() call below we will end up allocating
1921		 * another path. So free the path to avoid unnecessary extra
1922		 * memory usage.
1923		 */
1924		btrfs_free_path(path);
1925		path = NULL;
1926	}
1927
1928	/* If there are pending snapshots for this root, we must COW. */
1929	if (args->writeback_path && !is_freespace_inode &&
1930	    atomic_read(&root->snapshot_force_cow))
1931		goto out;
1932
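	/*
	 * Translate to the physical range we can write to: advance
	 * disk_bytenr to the part of the extent backing args->start and clamp
	 * num_bytes to the end of the extent or of the requested range,
	 * whichever comes first.
	 */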
1933	args->disk_bytenr += args->extent_offset;
1934	args->disk_bytenr += args->start - key->offset;
1935	args->num_bytes = min(args->end + 1, extent_end) - args->start;
1936
1937	/*
1938	 * Force COW if csums exist in the range. This ensures that csums for a
1939	 * given extent are either valid or do not exist.
1940	 */
1941	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
1942				  nowait);
1943	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1944	if (ret != 0)
1945		goto out;
1946
1947	can_nocow = 1;
1948 out:
1949	if (args->free_path && path)
1950		btrfs_free_path(path);
1951
1952	return ret < 0 ? ret : can_nocow;
1953}
1954
1955/*
1956 * Run NOCOW writeback for the given range.  This checks for snapshots or COW
1957 * copies of the extents that exist in the file, and COWs the file as required.
1958 *
1959 * If no COW copies or snapshots exist, we write directly to the existing
1960 * blocks on disk.
1961 */
1962static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1963				       struct page *locked_page,
1964				       const u64 start, const u64 end)
1965{
1966	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1967	struct btrfs_root *root = inode->root;
1968	struct btrfs_path *path;
1969	u64 cow_start = (u64)-1;
1970	u64 cur_offset = start;
1971	int ret;
1972	bool check_prev = true;
1973	u64 ino = btrfs_ino(inode);
1974	struct can_nocow_file_extent_args nocow_args = { 0 };
1975
1976	/*
1977	 * Normally on a zoned device we're only doing COW writes, but relocation
1978	 * on a zoned filesystem serializes I/O so that we're only writing
1979	 * sequentially and can end up here as well.
1980	 */
1981	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
1982
1983	path = btrfs_alloc_path();
1984	if (!path) {
1985		ret = -ENOMEM;
1986		goto error;
1987	}
1988
1989	nocow_args.end = end;
1990	nocow_args.writeback_path = true;
1991
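	/*
	 * Walk the file extent items covering [start, end].  Ranges that can
	 * be written in place become NOCOW (or prealloc) ordered extents;
	 * everything else (holes, shared, compressed or csummed extents) is
	 * accumulated into a COW region and handled by fallback_to_cow().
	 */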
1992	while (1) {
1993		struct btrfs_block_group *nocow_bg = NULL;
1994		struct btrfs_ordered_extent *ordered;
1995		struct btrfs_key found_key;
1996		struct btrfs_file_extent_item *fi;
1997		struct extent_buffer *leaf;
1998		u64 extent_end;
1999		u64 ram_bytes;
2000		u64 nocow_end;
2001		int extent_type;
2002		bool is_prealloc;
2003
2004		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2005					       cur_offset, 0);
2006		if (ret < 0)
2007			goto error;
2008
2009		/*
2010		 * If there is no extent for our range when doing the initial
2011		 * search, then go back to the previous slot as it will be the
2012		 * one containing the search offset
2013		 */
2014		if (ret > 0 && path->slots[0] > 0 && check_prev) {
2015			leaf = path->nodes[0];
2016			btrfs_item_key_to_cpu(leaf, &found_key,
2017					      path->slots[0] - 1);
2018			if (found_key.objectid == ino &&
2019			    found_key.type == BTRFS_EXTENT_DATA_KEY)
2020				path->slots[0]--;
2021		}
2022		check_prev = false;
2023next_slot:
2024		/* Go to next leaf if we have exhausted the current one */
2025		leaf = path->nodes[0];
2026		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2027			ret = btrfs_next_leaf(root, path);
2028			if (ret < 0)
2029				goto error;
2030			if (ret > 0)
2031				break;
2032			leaf = path->nodes[0];
2033		}
2034
2035		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2036
2037		/* Didn't find anything for our INO */
2038		if (found_key.objectid > ino)
2039			break;
2040		/*
2041		 * Keep searching until we find an EXTENT_DATA key or there are
2042		 * no more extents for this inode.
2043		 */
2044		if (WARN_ON_ONCE(found_key.objectid < ino) ||
2045		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
2046			path->slots[0]++;
2047			goto next_slot;
2048		}
2049
2050		/* Found key is not EXTENT_DATA_KEY or starts after req range */
2051		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2052		    found_key.offset > end)
2053			break;
2054
2055		/*
2056		 * If the found extent starts after requested offset, then
2057		 * adjust extent_end to be right before this extent begins
2058		 */
2059		if (found_key.offset > cur_offset) {
2060			extent_end = found_key.offset;
2061			extent_type = 0;
2062			goto must_cow;
2063		}
2064
2065		/*
2066		 * Found an extent which begins before our range and potentially
2067		 * intersects it.
2068		 */
2069		fi = btrfs_item_ptr(leaf, path->slots[0],
2070				    struct btrfs_file_extent_item);
2071		extent_type = btrfs_file_extent_type(leaf, fi);
2072		/* If this is triggered then we have a memory corruption. */
2073		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2074		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2075			ret = -EUCLEAN;
2076			goto error;
2077		}
2078		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
2079		extent_end = btrfs_file_extent_end(path);
2080
2081		/*
2082		 * If the extent we got ends before our current offset, skip to
2083		 * the next extent.
2084		 */
2085		if (extent_end <= cur_offset) {
2086			path->slots[0]++;
2087			goto next_slot;
2088		}
2089
2090		nocow_args.start = cur_offset;
2091		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2092		if (ret < 0)
2093			goto error;
2094		if (ret == 0)
2095			goto must_cow;
2096
2097		ret = 0;
2098		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
2099		if (!nocow_bg) {
2100must_cow:
2101			/*
2102			 * If we can't perform NOCOW writeback for the range,
2103			 * then record the beginning of the range that needs to
2104			 * be COWed.  It will be written out before the next
2105			 * NOCOW range if we find one, or when exiting this
2106			 * loop.
2107			 */
2108			if (cow_start == (u64)-1)
2109				cow_start = cur_offset;
2110			cur_offset = extent_end;
2111			if (cur_offset > end)
2112				break;
2113			if (!path->nodes[0])
2114				continue;
2115			path->slots[0]++;
2116			goto next_slot;
2117		}
2118
2119		/*
2120		 * COW the range from cow_start to found_key.offset - 1. The key
2121		 * contains the beginning of the first extent that can be NOCOWed,
2122		 * which follows a range that needs to be COWed.
2123		 */
2124		if (cow_start != (u64)-1) {
2125			ret = fallback_to_cow(inode, locked_page,
2126					      cow_start, found_key.offset - 1);
2127			cow_start = (u64)-1;
2128			if (ret) {
2129				btrfs_dec_nocow_writers(nocow_bg);
2130				goto error;
2131			}
2132		}
2133
2134		nocow_end = cur_offset + nocow_args.num_bytes - 1;
2135		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
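		/*
		 * Writes into a preallocated extent need a pinned extent map
		 * covering the range; when the ordered extent completes the
		 * prealloc file extent is converted to a regular one.
		 */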
2136		if (is_prealloc) {
2137			u64 orig_start = found_key.offset - nocow_args.extent_offset;
2138			struct extent_map *em;
2139
2140			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
2141					  orig_start,
2142					  nocow_args.disk_bytenr, /* block_start */
2143					  nocow_args.num_bytes, /* block_len */
2144					  nocow_args.disk_num_bytes, /* orig_block_len */
2145					  ram_bytes, BTRFS_COMPRESS_NONE,
2146					  BTRFS_ORDERED_PREALLOC);
2147			if (IS_ERR(em)) {
2148				btrfs_dec_nocow_writers(nocow_bg);
2149				ret = PTR_ERR(em);
2150				goto error;
2151			}
2152			free_extent_map(em);
2153		}
2154
2155		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2156				nocow_args.num_bytes, nocow_args.num_bytes,
2157				nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
2158				is_prealloc
2159				? (1 << BTRFS_ORDERED_PREALLOC)
2160				: (1 << BTRFS_ORDERED_NOCOW),
2161				BTRFS_COMPRESS_NONE);
2162		btrfs_dec_nocow_writers(nocow_bg);
2163		if (IS_ERR(ordered)) {
2164			if (is_prealloc) {
2165				btrfs_drop_extent_map_range(inode, cur_offset,
2166							    nocow_end, false);
2167			}
2168			ret = PTR_ERR(ordered);
2169			goto error;
2170		}
2171
2172		if (btrfs_is_data_reloc_root(root))
2173			/*
2174			 * Error handled later, as we must prevent
2175			 * extent_clear_unlock_delalloc() in error handler
2176			 * from freeing metadata of created ordered extent.
2177			 */
2178			ret = btrfs_reloc_clone_csums(ordered);
2179		btrfs_put_ordered_extent(ordered);
2180
2181		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2182					     locked_page, EXTENT_LOCKED |
2183					     EXTENT_DELALLOC |
2184					     EXTENT_CLEAR_DATA_RESV,
2185					     PAGE_UNLOCK | PAGE_SET_ORDERED);
2186
2187		cur_offset = extent_end;
2188
2189		/*
2190		 * btrfs_reloc_clone_csums() error, now we're OK to call error
2191		 * handler, as metadata for created ordered extent will only
2192		 * be freed by btrfs_finish_ordered_io().
2193		 */
2194		if (ret)
2195			goto error;
2196		if (cur_offset > end)
2197			break;
2198	}
2199	btrfs_release_path(path);
2200
2201	if (cur_offset <= end && cow_start == (u64)-1)
2202		cow_start = cur_offset;
2203
2204	if (cow_start != (u64)-1) {
2205		cur_offset = end;
2206		ret = fallback_to_cow(inode, locked_page, cow_start, end);
2207		cow_start = (u64)-1;
2208		if (ret)
2209			goto error;
2210	}
2211
2212	btrfs_free_path(path);
2213	return 0;
2214
2215error:
2216	/*
2217	 * If an error happened while a COW region is outstanding, cur_offset
2218	 * needs to be reset to cow_start to ensure the COW region is unlocked
2219	 * as well.
2220	 */
2221	if (cow_start != (u64)-1)
2222		cur_offset = cow_start;
2223	if (cur_offset < end)
2224		extent_clear_unlock_delalloc(inode, cur_offset, end,
2225					     locked_page, EXTENT_LOCKED |
2226					     EXTENT_DELALLOC | EXTENT_DEFRAG |
2227					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2228					     PAGE_START_WRITEBACK |
2229					     PAGE_END_WRITEBACK);
2230	btrfs_free_path(path);
2231	return ret;
2232}
2233
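/*
 * NOCOW writeback is only attempted for inodes with NODATACOW or PREALLOC
 * set, and even then not when part of the range is marked for defrag, since
 * defragged ranges must be rewritten (COWed).
 */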
2234static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2235{
2236	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2237		if (inode->defrag_bytes &&
2238		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
2239				   0, NULL))
2240			return false;
2241		return true;
2242	}
2243	return false;
2244}
2245
2246/*
2247 * Function to process delayed allocation (create CoW) for ranges which are
2248 * being touched for the first time.
2249 */
2250int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2251			     u64 start, u64 end, struct writeback_control *wbc)
2252{
2253	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2254	int ret;
2255
2256	/*
2257	 * The range must cover part of the @locked_page, or a return of 1
2258	 * can confuse the caller.
2259	 */
2260	ASSERT(!(end <= page_offset(locked_page) ||
2261		 start >= page_offset(locked_page) + PAGE_SIZE));
2262
2263	if (should_nocow(inode, start, end)) {
2264		ret = run_delalloc_nocow(inode, locked_page, start, end);
2265		goto out;
2266	}
2267
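	/*
	 * Try compressed writeback first.  run_delalloc_compressed() returns
	 * true only if it queued async work for the whole range, in which
	 * case there is nothing left to do here.
	 */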
2268	if (btrfs_inode_can_compress(inode) &&
2269	    inode_need_compress(inode, start, end) &&
2270	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
2271		return 1;
2272
2273	if (zoned)
2274		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
2275				       true);
2276	else
2277		ret = cow_file_range(inode, locked_page, start, end, NULL,
2278				     false, false);
2279
2280out:
2281	if (ret < 0)
2282		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2283					      end - start + 1);
2284	return ret;
2285}
2286
2287void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2288				 struct extent_state *orig, u64 split)
2289{
2290	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2291	u64 size;
2292
2293	/* not delalloc, ignore it */
2294	if (!(orig->state & EXTENT_DELALLOC))
2295		return;
2296
2297	size = orig->end - orig->start + 1;
2298	if (size > fs_info->max_extent_size) {
2299		u32 num_extents;
2300		u64 new_size;
2301
2302		/*
2303		 * See the explanation in btrfs_merge_delalloc_extent, the same
2304		 * applies here, just in reverse.
2305		 */
2306		new_size = orig->end - split + 1;
2307		num_extents = count_max_extents(fs_info, new_size);
2308		new_size = split - orig->start;
2309		num_extents += count_max_extents(fs_info, new_size);
2310		if (count_max_extents(fs_info, size) >= num_extents)
2311			return;
2312	}
2313
2314	spin_lock(&inode->lock);
2315	btrfs_mod_outstanding_extents(inode, 1);
2316	spin_unlock(&inode->lock);
2317}
2318
2319/*
2320 * Handle merged delayed allocation extents so we can keep track of new extents
2321 * that are just merged onto old extents, such as when we are doing sequential
2322 * writes, so we can properly account for the metadata space we'll need.
2323 */
2324void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2325				 struct extent_state *other)
2326{
2327	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2328	u64 new_size, old_size;
2329	u32 num_extents;
2330
2331	/* not delalloc, ignore it */
2332	if (!(other->state & EXTENT_DELALLOC))
2333		return;
2334
2335	if (new->start > other->start)
2336		new_size = new->end - other->start + 1;
2337	else
2338		new_size = other->end - new->start + 1;
2339
2340	/* we're not bigger than the max, unreserve the space and go */
2341	if (new_size <= fs_info->max_extent_size) {
2342		spin_lock(&inode->lock);
2343		btrfs_mod_outstanding_extents(inode, -1);
2344		spin_unlock(&inode->lock);
2345		return;
2346	}
2347
2348	/*
2349	 * We have to add up either side to figure out how many extents were
2350	 * accounted for before we merged into one big extent.  If the number of
2351	 * extents we accounted for is <= the amount we need for the new range
2352	 * then we can return, otherwise drop.  Think of it like this
2353	 *
2354	 * [ 4k][MAX_SIZE]
2355	 *
2356	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2357	 * need 2 outstanding extents, on one side we have 1 and the other side
2358	 * we have 1 so they are == and we can return.  But in this case
2359	 *
2360	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2361	 *
2362	 * Each range on their own accounts for 2 extents, but merged together
2363	 * they are only 3 extents worth of accounting, so we need to drop in
2364	 * this case.
2365	 */
2366	old_size = other->end - other->start + 1;
2367	num_extents = count_max_extents(fs_info, old_size);
2368	old_size = new->end - new->start + 1;
2369	num_extents += count_max_extents(fs_info, old_size);
2370	if (count_max_extents(fs_info, new_size) >= num_extents)
2371		return;
2372
2373	spin_lock(&inode->lock);
2374	btrfs_mod_outstanding_extents(inode, -1);
2375	spin_unlock(&inode->lock);
2376}
2377
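/*
 * Add the inode to its root's list of inodes with pending delalloc and, if it
 * is the first such inode for this root, add the root to the fs-wide list of
 * roots with delalloc inodes.
 */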
2378static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2379				      struct btrfs_inode *inode)
2380{
2381	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2382
2383	spin_lock(&root->delalloc_lock);
2384	if (list_empty(&inode->delalloc_inodes)) {
2385		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2386		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
2387		root->nr_delalloc_inodes++;
2388		if (root->nr_delalloc_inodes == 1) {
2389			spin_lock(&fs_info->delalloc_root_lock);
2390			BUG_ON(!list_empty(&root->delalloc_root));
2391			list_add_tail(&root->delalloc_root,
2392				      &fs_info->delalloc_roots);
2393			spin_unlock(&fs_info->delalloc_root_lock);
2394		}
2395	}
2396	spin_unlock(&root->delalloc_lock);
2397}
2398
2399void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2400				struct btrfs_inode *inode)
2401{
2402	struct btrfs_fs_info *fs_info = root->fs_info;
2403
2404	if (!list_empty(&inode->delalloc_inodes)) {
2405		list_del_init(&inode->delalloc_inodes);
2406		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2407			  &inode->runtime_flags);
2408		root->nr_delalloc_inodes--;
2409		if (!root->nr_delalloc_inodes) {
2410			ASSERT(list_empty(&root->delalloc_inodes));
2411			spin_lock(&fs_info->delalloc_root_lock);
2412			BUG_ON(list_empty(&root->delalloc_root));
2413			list_del_init(&root->delalloc_root);
2414			spin_unlock(&fs_info->delalloc_root_lock);
2415		}
2416	}
2417}
2418
2419static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2420				     struct btrfs_inode *inode)
2421{
2422	spin_lock(&root->delalloc_lock);
2423	__btrfs_del_delalloc_inode(root, inode);
2424	spin_unlock(&root->delalloc_lock);
2425}
2426
2427/*
2428 * Properly track delayed allocation bytes in the inode and maintain the
2429 * list of inodes that have pending delalloc work to be done.
2430 */
2431void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2432			       u32 bits)
2433{
2434	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2435
2436	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2437		WARN_ON(1);
2438	/*
2439	 * set_bit and clear_bit hooks normally require _irqsave/restore
2440	 * but in this case, we are only testing for the DELALLOC
2441	 * bit, which is only set or cleared with irqs on
2442	 */
2443	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2444		struct btrfs_root *root = inode->root;
2445		u64 len = state->end + 1 - state->start;
2446		u32 num_extents = count_max_extents(fs_info, len);
2447		bool do_list = !btrfs_is_free_space_inode(inode);
2448
2449		spin_lock(&inode->lock);
2450		btrfs_mod_outstanding_extents(inode, num_extents);
2451		spin_unlock(&inode->lock);
2452
2453		/* For sanity tests */
2454		if (btrfs_is_testing(fs_info))
2455			return;
2456
2457		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2458					 fs_info->delalloc_batch);
2459		spin_lock(&inode->lock);
2460		inode->delalloc_bytes += len;
2461		if (bits & EXTENT_DEFRAG)
2462			inode->defrag_bytes += len;
2463		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2464					 &inode->runtime_flags))
2465			btrfs_add_delalloc_inodes(root, inode);
2466		spin_unlock(&inode->lock);
2467	}
2468
2469	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2470	    (bits & EXTENT_DELALLOC_NEW)) {
2471		spin_lock(&inode->lock);
2472		inode->new_delalloc_bytes += state->end + 1 - state->start;
2473		spin_unlock(&inode->lock);
2474	}
2475}
2476
2477/*
2478 * Once a range is no longer delalloc this function ensures that proper
2479 * accounting happens.
2480 */
2481void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2482				 struct extent_state *state, u32 bits)
2483{
2484	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2485	u64 len = state->end + 1 - state->start;
2486	u32 num_extents = count_max_extents(fs_info, len);
2487
2488	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2489		spin_lock(&inode->lock);
2490		inode->defrag_bytes -= len;
2491		spin_unlock(&inode->lock);
2492	}
2493
2494	/*
2495	 * set_bit and clear_bit hooks normally require _irqsave/restore
2496	 * but in this case, we are only testing for the DELALLOC
2497	 * bit, which is only set or cleared with irqs on
2498	 */
2499	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2500		struct btrfs_root *root = inode->root;
2501		bool do_list = !btrfs_is_free_space_inode(inode);
2502
2503		spin_lock(&inode->lock);
2504		btrfs_mod_outstanding_extents(inode, -num_extents);
2505		spin_unlock(&inode->lock);
2506
2507		/*
2508		 * We don't reserve metadata space for space cache inodes so we
2509		 * don't need to call delalloc_release_metadata if there is an
2510		 * error.
2511		 */
2512		if (bits & EXTENT_CLEAR_META_RESV &&
2513		    root != fs_info->tree_root)
2514			btrfs_delalloc_release_metadata(inode, len, false);
2515
2516		/* For sanity tests. */
2517		if (btrfs_is_testing(fs_info))
2518			return;
2519
2520		if (!btrfs_is_data_reloc_root(root) &&
2521		    do_list && !(state->state & EXTENT_NORESERVE) &&
2522		    (bits & EXTENT_CLEAR_DATA_RESV))
2523			btrfs_free_reserved_data_space_noquota(fs_info, len);
2524
2525		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2526					 fs_info->delalloc_batch);
2527		spin_lock(&inode->lock);
2528		inode->delalloc_bytes -= len;
2529		if (do_list && inode->delalloc_bytes == 0 &&
2530		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2531					&inode->runtime_flags))
2532			btrfs_del_delalloc_inode(root, inode);
2533		spin_unlock(&inode->lock);
2534	}
2535
2536	if ((state->state & EXTENT_DELALLOC_NEW) &&
2537	    (bits & EXTENT_DELALLOC_NEW)) {
2538		spin_lock(&inode->lock);
2539		ASSERT(inode->new_delalloc_bytes >= len);
2540		inode->new_delalloc_bytes -= len;
2541		if (bits & EXTENT_ADD_INODE_BYTES)
2542			inode_add_bytes(&inode->vfs_inode, len);
2543		spin_unlock(&inode->lock);
2544	}
2545}
2546
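/*
 * Make sure a bio has a matching ordered extent.  The bio must start at the
 * beginning of the ordered extent; if it only covers part of it, split the
 * ordered extent (and, except for NOCOW writes, its extent map) and attach
 * the matching part to the bio.
 */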
2547static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2548					struct btrfs_ordered_extent *ordered)
2549{
2550	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2551	u64 len = bbio->bio.bi_iter.bi_size;
2552	struct btrfs_ordered_extent *new;
2553	int ret;
2554
2555	/* Must always be called for the beginning of an ordered extent. */
2556	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2557		return -EINVAL;
2558
2559	/* No need to split if the ordered extent covers the entire bio. */
2560	if (ordered->disk_num_bytes == len) {
2561		refcount_inc(&ordered->refs);
2562		bbio->ordered = ordered;
2563		return 0;
2564	}
2565
2566	/*
2567	 * Don't split the extent_map for NOCOW extents, as we're writing into
2568	 * a pre-existing one.
2569	 */
2570	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2571		ret = split_extent_map(bbio->inode, bbio->file_offset,
2572				       ordered->num_bytes, len,
2573				       ordered->disk_bytenr);
2574		if (ret)
2575			return ret;
2576	}
2577
2578	new = btrfs_split_ordered_extent(ordered, len);
2579	if (IS_ERR(new))
2580		return PTR_ERR(new);
2581	bbio->ordered = new;
2582	return 0;
2583}
2584
2585/*
2586 * Given a list of ordered sums, record them in the inode.  This happens
2587 * at IO completion time based on sums calculated at bio submission time.
2588 */
2589static int add_pending_csums(struct btrfs_trans_handle *trans,
2590			     struct list_head *list)
2591{
2592	struct btrfs_ordered_sum *sum;
2593	struct btrfs_root *csum_root = NULL;
2594	int ret;
2595
2596	list_for_each_entry(sum, list, list) {
2597		trans->adding_csums = true;
2598		if (!csum_root)
2599			csum_root = btrfs_csum_root(trans->fs_info,
2600						    sum->logical);
2601		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2602		trans->adding_csums = false;
2603		if (ret)
2604			return ret;
2605	}
2606	return 0;
2607}
2608
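/*
 * Mark the parts of the range [start, start + len) that are currently holes
 * (no extent allocated) with EXTENT_DELALLOC_NEW, so that completing the
 * delalloc adds those bytes to the inode's byte count.
 */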
2609static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2610					 const u64 start,
2611					 const u64 len,
2612					 struct extent_state **cached_state)
2613{
2614	u64 search_start = start;
2615	const u64 end = start + len - 1;
2616
2617	while (search_start < end) {
2618		const u64 search_len = end - search_start + 1;
2619		struct extent_map *em;
2620		u64 em_len;
2621		int ret = 0;
2622
2623		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2624		if (IS_ERR(em))
2625			return PTR_ERR(em);
2626
2627		if (em->block_start != EXTENT_MAP_HOLE)
2628			goto next;
2629
2630		em_len = em->len;
2631		if (em->start < search_start)
2632			em_len -= search_start - em->start;
2633		if (em_len > search_len)
2634			em_len = search_len;
2635
2636		ret = set_extent_bit(&inode->io_tree, search_start,
2637				     search_start + em_len - 1,
2638				     EXTENT_DELALLOC_NEW, cached_state);
2639next:
2640		search_start = extent_map_end(em);
2641		free_extent_map(em);
2642		if (ret)
2643			return ret;
2644	}
2645	return 0;
2646}
2647
2648int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2649			      unsigned int extra_bits,
2650			      struct extent_state **cached_state)
2651{
2652	WARN_ON(PAGE_ALIGNED(end));
2653
2654	if (start >= i_size_read(&inode->vfs_inode) &&
2655	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2656		/*
2657		 * There can't be any extents following eof in this case so just
2658		 * set the delalloc new bit for the range directly.
2659		 */
2660		extra_bits |= EXTENT_DELALLOC_NEW;
2661	} else {
2662		int ret;
2663
2664		ret = btrfs_find_new_delalloc_bytes(inode, start,
2665						    end + 1 - start,
2666						    cached_state);
2667		if (ret)
2668			return ret;
2669	}
2670
2671	return set_extent_bit(&inode->io_tree, start, end,
2672			      EXTENT_DELALLOC | extra_bits, cached_state);
2673}
2674
2675/* see btrfs_writepage_start_hook for details on why this is required */
2676struct btrfs_writepage_fixup {
2677	struct page *page;
2678	struct btrfs_inode *inode;
2679	struct btrfs_work work;
2680};
2681
2682static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2683{
2684	struct btrfs_writepage_fixup *fixup =
2685		container_of(work, struct btrfs_writepage_fixup, work);
2686	struct btrfs_ordered_extent *ordered;
2687	struct extent_state *cached_state = NULL;
2688	struct extent_changeset *data_reserved = NULL;
2689	struct page *page = fixup->page;
2690	struct btrfs_inode *inode = fixup->inode;
2691	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2692	u64 page_start = page_offset(page);
2693	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2694	int ret = 0;
2695	bool free_delalloc_space = true;
2696
2697	/*
2698	 * This is similar to page_mkwrite, we need to reserve the space before
2699	 * we take the page lock.
2700	 */
2701	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2702					   PAGE_SIZE);
2703again:
2704	lock_page(page);
2705
2706	/*
2707	 * Before we queued this fixup, we took a reference on the page.
2708	 * page->mapping may go NULL, but it shouldn't be moved to a different
2709	 * address space.
2710	 */
2711	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2712		/*
2713		 * Unfortunately this is a little tricky, either
2714		 *
2715		 * 1) We got here and our page had already been dealt with and
2716		 *    we reserved our space, thus ret == 0, so we need to just
2717		 *    drop our space reservation and bail.  This can happen the
2718		 *    first time we come into the fixup worker, or could happen
2719		 *    while waiting for the ordered extent.
2720		 * 2) Our page was already dealt with, but we happened to get an
2721		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2722		 *    this case we obviously don't have anything to release, but
2723		 *    because the page was already dealt with we don't want to
2724		 *    mark the page with an error, so make sure we're resetting
2725		 *    ret to 0.  This is why we have this check _before_ the ret
2726		 *    check, because we do not want to have a surprise ENOSPC
2727		 *    when the page was already properly dealt with.
2728		 */
2729		if (!ret) {
2730			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2731			btrfs_delalloc_release_space(inode, data_reserved,
2732						     page_start, PAGE_SIZE,
2733						     true);
2734		}
2735		ret = 0;
2736		goto out_page;
2737	}
2738
2739	/*
2740	 * We can't mess with the page state unless it is locked, so now that
2741	 * it is locked bail if we failed to make our space reservation.
2742	 */
2743	if (ret)
2744		goto out_page;
2745
2746	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2747
2748	/* already ordered? We're done */
2749	if (PageOrdered(page))
2750		goto out_reserved;
2751
2752	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2753	if (ordered) {
2754		unlock_extent(&inode->io_tree, page_start, page_end,
2755			      &cached_state);
2756		unlock_page(page);
2757		btrfs_start_ordered_extent(ordered);
2758		btrfs_put_ordered_extent(ordered);
2759		goto again;
2760	}
2761
2762	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2763					&cached_state);
2764	if (ret)
2765		goto out_reserved;
2766
2767	/*
2768	 * Everything went as planned, we're now the owner of a dirty page with
2769	 * delayed allocation bits set and space reserved for our COW
2770	 * destination.
2771	 *
2772	 * The page was dirty when we started, nothing should have cleaned it.
2773	 */
2774	BUG_ON(!PageDirty(page));
2775	free_delalloc_space = false;
2776out_reserved:
2777	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2778	if (free_delalloc_space)
2779		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2780					     PAGE_SIZE, true);
2781	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2782out_page:
2783	if (ret) {
2784		/*
2785		 * We hit ENOSPC or other errors.  Update the mapping and page
2786		 * to reflect the errors and clean the page.
2787		 */
2788		mapping_set_error(page->mapping, ret);
2789		btrfs_mark_ordered_io_finished(inode, page, page_start,
2790					       PAGE_SIZE, !ret);
2791		clear_page_dirty_for_io(page);
2792	}
2793	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
2794	unlock_page(page);
2795	put_page(page);
2796	kfree(fixup);
2797	extent_changeset_free(data_reserved);
2798	/*
2799	 * As a precaution, do a delayed iput in case it would be the last iput
2800	 * that could need flushing space. Recursing back to fixup worker would
2801	 * deadlock.
2802	 */
2803	btrfs_add_delayed_iput(inode);
2804}
2805
2806/*
2807 * There are a few paths in the higher layers of the kernel that directly
2808 * set the page dirty bit without asking the filesystem if it is a
2809 * good idea.  This causes problems because we want to make sure COW
2810 * properly happens and the data=ordered rules are followed.
2811 *
2812 * In our case any range that doesn't have the ORDERED bit set
2813 * hasn't been properly setup for IO.  We kick off an async process
2814 * hasn't been properly set up for IO.  We kick off an async process
2815 * the delalloc bit and make it safe to write the page.
2816 */
2817int btrfs_writepage_cow_fixup(struct page *page)
2818{
2819	struct inode *inode = page->mapping->host;
2820	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2821	struct btrfs_writepage_fixup *fixup;
2822
2823	/* This page has ordered extent covering it already */
2824	if (PageOrdered(page))
2825		return 0;
2826
2827	/*
2828	 * PageChecked is set below when we create a fixup worker for this page,
2829	 * don't try to create another one if we're already PageChecked()
2830	 *
2831	 * The extent_io writepage code will redirty the page if we send back
2832	 * EAGAIN.
2833	 */
2834	if (PageChecked(page))
2835		return -EAGAIN;
2836
2837	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2838	if (!fixup)
2839		return -EAGAIN;
2840
2841	/*
2842	 * We are already holding a reference to this inode from
2843	 * write_cache_pages.  We need to hold it because the space reservation
2844	 * takes place outside of the page lock, and we can't trust
2845	 * page->mapping outside of the page lock.
2846	 */
2847	ihold(inode);
2848	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2849	get_page(page);
2850	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2851	fixup->page = page;
2852	fixup->inode = BTRFS_I(inode);
2853	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2854
2855	return -EAGAIN;
2856}
2857
2858static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2859				       struct btrfs_inode *inode, u64 file_pos,
2860				       struct btrfs_file_extent_item *stack_fi,
2861				       const bool update_inode_bytes,
2862				       u64 qgroup_reserved)
2863{
2864	struct btrfs_root *root = inode->root;
2865	const u64 sectorsize = root->fs_info->sectorsize;
2866	struct btrfs_path *path;
2867	struct extent_buffer *leaf;
2868	struct btrfs_key ins;
2869	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2870	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2871	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2872	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2873	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2874	struct btrfs_drop_extents_args drop_args = { 0 };
2875	int ret;
2876
2877	path = btrfs_alloc_path();
2878	if (!path)
2879		return -ENOMEM;
2880
2881	/*
2882	 * we may be replacing one extent in the tree with another.
2883	 * The new extent is pinned in the extent map, and we don't want
2884	 * to drop it from the cache until it is completely in the btree.
2885	 *
2886	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2887	 * the caller is expected to unpin it and allow it to be merged
2888	 * The caller is expected to unpin it and allow it to be merged
2889	 */
2890	drop_args.path = path;
2891	drop_args.start = file_pos;
2892	drop_args.end = file_pos + num_bytes;
2893	drop_args.replace_extent = true;
2894	drop_args.extent_item_size = sizeof(*stack_fi);
2895	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2896	if (ret)
2897		goto out;
2898
2899	if (!drop_args.extent_inserted) {
2900		ins.objectid = btrfs_ino(inode);
2901		ins.offset = file_pos;
2902		ins.type = BTRFS_EXTENT_DATA_KEY;
2903
2904		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2905					      sizeof(*stack_fi));
2906		if (ret)
2907			goto out;
2908	}
2909	leaf = path->nodes[0];
2910	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2911	write_extent_buffer(leaf, stack_fi,
2912			btrfs_item_ptr_offset(leaf, path->slots[0]),
2913			sizeof(struct btrfs_file_extent_item));
2914
2915	btrfs_mark_buffer_dirty(trans, leaf);
2916	btrfs_release_path(path);
2917
2918	/*
2919	 * If we dropped an inline extent here, we know the range where it is
2920	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2921	 * number of bytes only for that range containing the inline extent.
2922	 * The remainder of the range will be processed when clearing the
2923	 * EXTENT_DELALLOC bit through the ordered extent completion.
2924	 */
2925	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2926		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2927
2928		inline_size = drop_args.bytes_found - inline_size;
2929		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2930		drop_args.bytes_found -= inline_size;
2931		num_bytes -= sectorsize;
2932	}
2933
2934	if (update_inode_bytes)
2935		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2936
2937	ins.objectid = disk_bytenr;
2938	ins.offset = disk_num_bytes;
2939	ins.type = BTRFS_EXTENT_ITEM_KEY;
2940
2941	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2942	if (ret)
2943		goto out;
2944
2945	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2946					       file_pos - offset,
2947					       qgroup_reserved, &ins);
2948out:
2949	btrfs_free_path(path);
2950
2951	return ret;
2952}
2953
2954static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2955					 u64 start, u64 len)
2956{
2957	struct btrfs_block_group *cache;
2958
2959	cache = btrfs_lookup_block_group(fs_info, start);
2960	ASSERT(cache);
2961
2962	spin_lock(&cache->lock);
2963	cache->delalloc_bytes -= len;
2964	spin_unlock(&cache->lock);
2965
2966	btrfs_put_block_group(cache);
2967}
2968
2969static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2970					     struct btrfs_ordered_extent *oe)
2971{
2972	struct btrfs_file_extent_item stack_fi;
2973	bool update_inode_bytes;
2974	u64 num_bytes = oe->num_bytes;
2975	u64 ram_bytes = oe->ram_bytes;
2976
2977	memset(&stack_fi, 0, sizeof(stack_fi));
2978	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2979	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2980	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2981						   oe->disk_num_bytes);
2982	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
2983	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
2984		num_bytes = oe->truncated_len;
2985		ram_bytes = num_bytes;
2986	}
2987	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
2988	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
2989	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2990	/* Encryption and other encoding is reserved and all 0 */
2991
2992	/*
2993	 * For delalloc, when completing an ordered extent we update the inode's
2994	 * bytes when clearing the range in the inode's io tree, so pass false
2995	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2996	 * except if the ordered extent was truncated.
2997	 */
2998	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
2999			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3000			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3001
3002	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3003					   oe->file_offset, &stack_fi,
3004					   update_inode_bytes, oe->qgroup_rsv);
3005}
3006
3007/*
3008 * As ordered data IO finishes, this gets called so we can finish
3009 * an ordered extent if the range of bytes in the file it covers are
3010 * fully written.
3011 */
3012int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3013{
3014	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3015	struct btrfs_root *root = inode->root;
3016	struct btrfs_fs_info *fs_info = root->fs_info;
3017	struct btrfs_trans_handle *trans = NULL;
3018	struct extent_io_tree *io_tree = &inode->io_tree;
3019	struct extent_state *cached_state = NULL;
3020	u64 start, end;
3021	int compress_type = 0;
3022	int ret = 0;
3023	u64 logical_len = ordered_extent->num_bytes;
3024	bool freespace_inode;
3025	bool truncated = false;
3026	bool clear_reserved_extent = true;
3027	unsigned int clear_bits = EXTENT_DEFRAG;
3028
3029	start = ordered_extent->file_offset;
3030	end = start + ordered_extent->num_bytes - 1;
3031
3032	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3033	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3034	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3035	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3036		clear_bits |= EXTENT_DELALLOC_NEW;
3037
3038	freespace_inode = btrfs_is_free_space_inode(inode);
3039	if (!freespace_inode)
3040		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3041
3042	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3043		ret = -EIO;
3044		goto out;
3045	}
3046
3047	if (btrfs_is_zoned(fs_info))
3048		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3049					ordered_extent->disk_num_bytes);
3050
3051	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3052		truncated = true;
3053		logical_len = ordered_extent->truncated_len;
3054		/* Truncated the entire extent, don't bother adding */
3055		if (!logical_len)
3056			goto out;
3057	}
3058
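	/*
	 * NOCOW writes went into an existing extent, so there is no new file
	 * extent item to insert and no reserved extent to release; just make
	 * sure the on-disk i_size and the inode item are up to date.
	 */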
3059	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3060		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3061
3062		btrfs_inode_safe_disk_i_size_write(inode, 0);
3063		if (freespace_inode)
3064			trans = btrfs_join_transaction_spacecache(root);
3065		else
3066			trans = btrfs_join_transaction(root);
3067		if (IS_ERR(trans)) {
3068			ret = PTR_ERR(trans);
3069			trans = NULL;
3070			goto out;
3071		}
3072		trans->block_rsv = &inode->block_rsv;
3073		ret = btrfs_update_inode_fallback(trans, root, inode);
3074		if (ret) /* -ENOMEM or corruption */
3075			btrfs_abort_transaction(trans, ret);
3076		goto out;
3077	}
3078
3079	clear_bits |= EXTENT_LOCKED;
3080	lock_extent(io_tree, start, end, &cached_state);
3081
3082	if (freespace_inode)
3083		trans = btrfs_join_transaction_spacecache(root);
3084	else
3085		trans = btrfs_join_transaction(root);
3086	if (IS_ERR(trans)) {
3087		ret = PTR_ERR(trans);
3088		trans = NULL;
3089		goto out;
3090	}
3091
3092	trans->block_rsv = &inode->block_rsv;
3093
3094	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3095		compress_type = ordered_extent->compress_type;
3096	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3097		BUG_ON(compress_type);
3098		ret = btrfs_mark_extent_written(trans, inode,
3099						ordered_extent->file_offset,
3100						ordered_extent->file_offset +
3101						logical_len);
3102		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3103						  ordered_extent->disk_num_bytes);
3104	} else {
3105		BUG_ON(root == fs_info->tree_root);
3106		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3107		if (!ret) {
3108			clear_reserved_extent = false;
3109			btrfs_release_delalloc_bytes(fs_info,
3110						ordered_extent->disk_bytenr,
3111						ordered_extent->disk_num_bytes);
3112		}
3113	}
3114	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3115			   ordered_extent->num_bytes, trans->transid);
3116	if (ret < 0) {
3117		btrfs_abort_transaction(trans, ret);
3118		goto out;
3119	}
3120
3121	ret = add_pending_csums(trans, &ordered_extent->list);
3122	if (ret) {
3123		btrfs_abort_transaction(trans, ret);
3124		goto out;
3125	}
3126
3127	/*
3128	 * If this is a new delalloc range, clear its new delalloc flag to
3129	 * update the inode's number of bytes. This needs to be done before
3130	 * updating the inode item.
3131	 */
3132	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3133	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3134		clear_extent_bit(&inode->io_tree, start, end,
3135				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3136				 &cached_state);
3137
3138	btrfs_inode_safe_disk_i_size_write(inode, 0);
3139	ret = btrfs_update_inode_fallback(trans, root, inode);
3140	if (ret) { /* -ENOMEM or corruption */
3141		btrfs_abort_transaction(trans, ret);
3142		goto out;
3143	}
3144	ret = 0;
3145out:
3146	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3147			 &cached_state);
3148
3149	if (trans)
3150		btrfs_end_transaction(trans);
3151
3152	if (ret || truncated) {
3153		u64 unwritten_start = start;
3154
3155		/*
3156		 * If we failed to finish this ordered extent for any reason we
3157		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3158		 * extent, and mark the inode with the error if it wasn't
3159		 * already set.  Any error during writeback would have already
3160		 * set the mapping error, so we need to set it if we're the ones
3161		 * marking this ordered extent as failed.
3162		 */
3163		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3164					     &ordered_extent->flags))
3165			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3166
3167		if (truncated)
3168			unwritten_start += logical_len;
3169		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3170
3171		/*
3172		 * Drop extent maps for the part of the extent we didn't write.
3173		 *
3174		 * We have an exception here for the free_space_inode, this is
3175		 * because when we do btrfs_get_extent() on the free space inode
3176		 * we will search the commit root.  If this is a new block group
3177		 * we won't find anything, and we will trip over the assert in
3178		 * writepage where we do ASSERT(em->block_start !=
3179		 * EXTENT_MAP_HOLE).
3180		 *
3181		 * Theoretically we could also skip this for any NOCOW extent as
3182		 * we don't mess with the extent map tree in the NOCOW case, but
3183		 * for now simply skip this if we are the free space inode.
3184		 */
3185		if (!btrfs_is_free_space_inode(inode))
3186			btrfs_drop_extent_map_range(inode, unwritten_start,
3187						    end, false);
3188
3189		/*
3190		 * If the ordered extent had an IOERR or something else went
3191		 * wrong we need to return the space for this ordered extent
3192		 * back to the allocator.  We only free the extent in the
3193		 * truncated case if we didn't write out the extent at all.
3194		 *
3195		 * If we made it past insert_reserved_file_extent before we
3196		 * errored out then we don't need to do this as the accounting
3197		 * has already been done.
3198		 */
3199		if ((ret || !logical_len) &&
3200		    clear_reserved_extent &&
3201		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3202		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3203			/*
3204			 * Discard the range before returning it back to the
3205			 * free space pool
3206			 */
3207			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3208				btrfs_discard_extent(fs_info,
3209						ordered_extent->disk_bytenr,
3210						ordered_extent->disk_num_bytes,
3211						NULL);
3212			btrfs_free_reserved_extent(fs_info,
3213					ordered_extent->disk_bytenr,
3214					ordered_extent->disk_num_bytes, 1);
3215			/*
3216			 * Actually free the qgroup rsv which was released when
3217			 * the ordered extent was created.
3218			 */
3219			btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
3220						  ordered_extent->qgroup_rsv,
3221						  BTRFS_QGROUP_RSV_DATA);
3222		}
3223	}
3224
3225	/*
3226	 * This needs to be done to make sure anybody waiting knows we are done
3227	 * updating everything for this ordered extent.
3228	 */
3229	btrfs_remove_ordered_extent(inode, ordered_extent);
3230
3231	/* once for us */
3232	btrfs_put_ordered_extent(ordered_extent);
3233	/* once for the tree */
3234	btrfs_put_ordered_extent(ordered_extent);
3235
3236	return ret;
3237}
3238
3239int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3240{
3241	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
3242	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
3243		btrfs_finish_ordered_zoned(ordered);
3244	return btrfs_finish_one_ordered(ordered);
3245}
3246
3247/*
3248 * Verify the checksum for a single sector without any extra action that depends
3249 * on the type of I/O.
3250 */
3251int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3252			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
3253{
3254	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3255	char *kaddr;
3256
3257	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3258
3259	shash->tfm = fs_info->csum_shash;
3260
3261	kaddr = kmap_local_page(page) + pgoff;
3262	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3263	kunmap_local(kaddr);
3264
3265	if (memcmp(csum, csum_expected, fs_info->csum_size))
3266		return -EIO;
3267	return 0;
3268}
3269
3270/*
3271 * Verify the checksum of a single data sector.
3272 *
3273 * @bbio:	btrfs_io_bio which contains the csum
3274 * @dev:	device the sector is on
3275 * @bio_offset:	offset to the beginning of the bio (in bytes)
3276 * @bv:		bio_vec to check
3277 *
3278 * Check if the checksum on a data block is valid.  When a checksum mismatch is
3279 * detected, report the error and fill the corrupted range with zero.
3280 *
3281 * Return %true if the sector is ok or had no checksum to start with, else %false.
3282 */
3283bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3284			u32 bio_offset, struct bio_vec *bv)
3285{
3286	struct btrfs_inode *inode = bbio->inode;
3287	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3288	u64 file_offset = bbio->file_offset + bio_offset;
3289	u64 end = file_offset + bv->bv_len - 1;
3290	u8 *csum_expected;
3291	u8 csum[BTRFS_CSUM_SIZE];
3292
3293	ASSERT(bv->bv_len == fs_info->sectorsize);
3294
3295	if (!bbio->csum)
3296		return true;
3297
3298	if (btrfs_is_data_reloc_root(inode->root) &&
3299	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3300			   1, NULL)) {
3301		/* Skip the range without csum for data reloc inode */
3302		clear_extent_bits(&inode->io_tree, file_offset, end,
3303				  EXTENT_NODATASUM);
3304		return true;
3305	}
3306
3307	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3308				fs_info->csum_size;
3309	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3310				    csum_expected))
3311		goto zeroit;
3312	return true;
3313
3314zeroit:
3315	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3316				    bbio->mirror_num);
3317	if (dev)
3318		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3319	memzero_bvec(bv);
3320	return false;
3321}
3322
3323/*
3324 * btrfs_add_delayed_iput - perform a delayed iput on @inode
3325 *
3326 * @inode: The inode we want to perform iput on
3327 *
3328 * This function uses the generic vfs_inode::i_count to track whether we should
3329 * just decrement it (in case it's > 1) or if this is the last iput then link
3330 * the inode to the delayed iput machinery. Delayed iputs are processed at
3331 * transaction commit time/superblock commit/cleaner kthread.
3332 */
3333void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3334{
3335	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3336	unsigned long flags;
3337
3338	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3339		return;
3340
3341	atomic_inc(&fs_info->nr_delayed_iputs);
3342	/*
3343	 * Need to be irq safe here because we can be called from either an irq
3344	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3345	 * context.
3346	 */
3347	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3348	ASSERT(list_empty(&inode->delayed_iput));
3349	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3350	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3351	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3352		wake_up_process(fs_info->cleaner_kthread);
3353}
3354
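/*
 * Run one queued delayed iput. Must be called with delayed_iput_lock held;
 * the lock is dropped around the iput() call and re-taken before returning.
 * Waiters on delayed_iputs_wait are woken once the last pending iput is done.
 */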
3355static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3356				    struct btrfs_inode *inode)
3357{
3358	list_del_init(&inode->delayed_iput);
3359	spin_unlock_irq(&fs_info->delayed_iput_lock);
3360	iput(&inode->vfs_inode);
3361	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3362		wake_up(&fs_info->delayed_iputs_wait);
3363	spin_lock_irq(&fs_info->delayed_iput_lock);
3364}
3365
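/*
 * Run the delayed iput of a single inode if one is queued. The list check is
 * repeated under delayed_iput_lock since the iput may already have been run
 * concurrently by the cleaner or a transaction commit.
 */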
3366static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3367				   struct btrfs_inode *inode)
3368{
3369	if (!list_empty(&inode->delayed_iput)) {
3370		spin_lock_irq(&fs_info->delayed_iput_lock);
3371		if (!list_empty(&inode->delayed_iput))
3372			run_delayed_iput_locked(fs_info, inode);
3373		spin_unlock_irq(&fs_info->delayed_iput_lock);
3374	}
3375}
3376
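/*
 * Process all inodes currently queued for a delayed iput, performing the
 * deferred iput() for each one. The lock is dropped around every iput() call
 * and we reschedule when needed, so a long list does not hog the CPU.
 */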
3377void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3378{
3379	/*
3380	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3381	 * calls btrfs_add_delayed_iput() and that needs to lock
3382	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3383	 * prevent a deadlock.
3384	 */
3385	spin_lock_irq(&fs_info->delayed_iput_lock);
3386	while (!list_empty(&fs_info->delayed_iputs)) {
3387		struct btrfs_inode *inode;
3388
3389		inode = list_first_entry(&fs_info->delayed_iputs,
3390				struct btrfs_inode, delayed_iput);
3391		run_delayed_iput_locked(fs_info, inode);
3392		if (need_resched()) {
3393			spin_unlock_irq(&fs_info->delayed_iput_lock);
3394			cond_resched();
3395			spin_lock_irq(&fs_info->delayed_iput_lock);
3396		}
3397	}
3398	spin_unlock_irq(&fs_info->delayed_iput_lock);
3399}
3400
3401/*
3402 * Wait for all delayed iputs to finish
3403 *
3404 * @fs_info:  the filesystem
3405 *
3406 * This will wait on any delayed iputs that are currently running with KILLABLE
3407 * set.  Once they are all done running we will return, unless we are killed in
3408 * which case we return EINTR. This helps in user operations like fallocate etc
3409 * that might get blocked on the iputs.
3410 *
3411 * Return EINTR if we were killed, 0 if nothing's pending
3412 */
3413int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3414{
3415	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3416			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3417	if (ret)
3418		return -EINTR;
3419	return 0;
3420}
3421
3422/*
3423 * This creates an orphan entry for the given inode in case something goes wrong
3424 * in the middle of an unlink.
3425 */
3426int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3427		     struct btrfs_inode *inode)
3428{
3429	int ret;
3430
3431	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3432	if (ret && ret != -EEXIST) {
3433		btrfs_abort_transaction(trans, ret);
3434		return ret;
3435	}
3436
3437	return 0;
3438}
3439
3440/*
3441 * We have done the delete so we can go ahead and remove the orphan item for
3442 * this particular inode.
3443 */
3444static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3445			    struct btrfs_inode *inode)
3446{
3447	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3448}
3449
3450/*
3451 * this cleans up any orphans that may be left on the list from the last use
3452 * of this root.
3453 */
3454int btrfs_orphan_cleanup(struct btrfs_root *root)
3455{
3456	struct btrfs_fs_info *fs_info = root->fs_info;
3457	struct btrfs_path *path;
3458	struct extent_buffer *leaf;
3459	struct btrfs_key key, found_key;
3460	struct btrfs_trans_handle *trans;
3461	struct inode *inode;
3462	u64 last_objectid = 0;
3463	int ret = 0, nr_unlink = 0;
3464
3465	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3466		return 0;
3467
3468	path = btrfs_alloc_path();
3469	if (!path) {
3470		ret = -ENOMEM;
3471		goto out;
3472	}
3473	path->reada = READA_BACK;
3474
3475	key.objectid = BTRFS_ORPHAN_OBJECTID;
3476	key.type = BTRFS_ORPHAN_ITEM_KEY;
3477	key.offset = (u64)-1;
3478
3479	while (1) {
3480		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3481		if (ret < 0)
3482			goto out;
3483
3484		/*
3485		 * ret == 0 means we found what we were searching for, which is
3486		 * weird, but possible, so only screw with the path if we didn't
3487		 * find the key and see if we have stuff that matches
3488		 */
3489		if (ret > 0) {
3490			ret = 0;
3491			if (path->slots[0] == 0)
3492				break;
3493			path->slots[0]--;
3494		}
3495
3496		/* pull out the item */
3497		leaf = path->nodes[0];
3498		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3499
3500		/* make sure the item matches what we want */
3501		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3502			break;
3503		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3504			break;
3505
3506		/* release the path since we're done with it */
3507		btrfs_release_path(path);
3508
3509		/*
3510		 * this is where we are basically btrfs_lookup, without the
3511		 * crossing root thing.  we store the inode number in the
3512		 * offset of the orphan item.
3513		 */
3514
3515		if (found_key.offset == last_objectid) {
3516			/*
3517			 * We found the same inode as before. This means we were
3518			 * not able to remove its items via eviction triggered
3519			 * by an iput(). A transaction abort may have happened,
3520			 * due to -ENOSPC for example, so try to grab the error
3521			 * that led to a transaction abort, if any.
3522			 */
3523			btrfs_err(fs_info,
3524				  "Error removing orphan entry, stopping orphan cleanup");
3525			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3526			goto out;
3527		}
3528
3529		last_objectid = found_key.offset;
3530
3531		found_key.objectid = found_key.offset;
3532		found_key.type = BTRFS_INODE_ITEM_KEY;
3533		found_key.offset = 0;
3534		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3535		if (IS_ERR(inode)) {
3536			ret = PTR_ERR(inode);
3537			inode = NULL;
3538			if (ret != -ENOENT)
3539				goto out;
3540		}
3541
3542		if (!inode && root == fs_info->tree_root) {
3543			struct btrfs_root *dead_root;
3544			int is_dead_root = 0;
3545
3546			/*
3547			 * This is an orphan in the tree root. Currently these
3548			 * could come from 2 sources:
3549			 *  a) a root (snapshot/subvolume) deletion in progress
3550			 *  b) a free space cache inode
3551			 * We need to distinguish those two, as the orphan item
3552			 * for a root must not get deleted before the deletion
3553			 * of the snapshot/subvolume's tree completes.
3554			 *
3555			 * btrfs_find_orphan_roots() ran before us, which has
3556			 * found all deleted roots and loaded them into
3557			 * fs_info->fs_roots_radix. So here we can find if an
3558			 * orphan item corresponds to a deleted root by looking
3559			 * up the root from that radix tree.
3560			 */
3561
3562			spin_lock(&fs_info->fs_roots_radix_lock);
3563			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3564							 (unsigned long)found_key.objectid);
3565			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3566				is_dead_root = 1;
3567			spin_unlock(&fs_info->fs_roots_radix_lock);
3568
3569			if (is_dead_root) {
3570				/* prevent this orphan from being found again */
3571				key.offset = found_key.objectid - 1;
3572				continue;
3573			}
3574
3575		}
3576
3577		/*
3578		 * If we have an inode with links, there are a couple of
3579		 * possibilities:
3580		 *
3581		 * 1. We were halfway through creating fsverity metadata for the
3582		 * file. In that case, the orphan item represents incomplete
3583		 * fsverity metadata which must be cleaned up with
3584		 * btrfs_drop_verity_items and deleting the orphan item.
3585		 *
3586		 * 2. Old kernels (before v3.12) used to create an
3587		 * orphan item for truncate indicating that there were possibly
3588		 * extent items past i_size that needed to be deleted. In v3.12,
3589		 * truncate was changed to update i_size in sync with the extent
3590		 * items, but the (useless) orphan item was still created. Since
3591		 * v4.18, we don't create the orphan item for truncate at all.
3592		 *
3593		 * So, this item could mean that we need to do a truncate, but
3594		 * only if this filesystem was last used on a pre-v3.12 kernel
3595		 * and was not cleanly unmounted. The odds of that are quite
3596		 * slim, and it's a pain to do the truncate now, so just delete
3597		 * the orphan item.
3598		 *
3599		 * It's also possible that this orphan item was supposed to be
3600		 * deleted but wasn't. The inode number may have been reused,
3601		 * but either way, we can delete the orphan item.
3602		 */
3603		if (!inode || inode->i_nlink) {
3604			if (inode) {
3605				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3606				iput(inode);
3607				inode = NULL;
3608				if (ret)
3609					goto out;
3610			}
3611			trans = btrfs_start_transaction(root, 1);
3612			if (IS_ERR(trans)) {
3613				ret = PTR_ERR(trans);
3614				goto out;
3615			}
3616			btrfs_debug(fs_info, "auto deleting %Lu",
3617				    found_key.objectid);
3618			ret = btrfs_del_orphan_item(trans, root,
3619						    found_key.objectid);
3620			btrfs_end_transaction(trans);
3621			if (ret)
3622				goto out;
3623			continue;
3624		}
3625
3626		nr_unlink++;
3627
3628		/* this will do delete_inode and everything for us */
3629		iput(inode);
3630	}
3631	/* release the path since we're done with it */
3632	btrfs_release_path(path);
3633
3634	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3635		trans = btrfs_join_transaction(root);
3636		if (!IS_ERR(trans))
3637			btrfs_end_transaction(trans);
3638	}
3639
3640	if (nr_unlink)
3641		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3642
3643out:
3644	if (ret)
3645		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3646	btrfs_free_path(path);
3647	return ret;
3648}
3649
3650/*
3651 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3652 * don't find any xattrs, we know there can't be any acls.
3653 *
3654 * slot is the slot the inode is in, objectid is the objectid of the inode
3655 */
3656static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3657					  int slot, u64 objectid,
3658					  int *first_xattr_slot)
3659{
3660	u32 nritems = btrfs_header_nritems(leaf);
3661	struct btrfs_key found_key;
3662	static u64 xattr_access = 0;
3663	static u64 xattr_default = 0;
3664	int scanned = 0;
3665
3666	if (!xattr_access) {
3667		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3668					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3669		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3670					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3671	}
3672
3673	slot++;
3674	*first_xattr_slot = -1;
3675	while (slot < nritems) {
3676		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3677
3678		/* we found a different objectid, there must not be acls */
3679		if (found_key.objectid != objectid)
3680			return 0;
3681
3682		/* we found an xattr, assume we've got an acl */
3683		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3684			if (*first_xattr_slot == -1)
3685				*first_xattr_slot = slot;
3686			if (found_key.offset == xattr_access ||
3687			    found_key.offset == xattr_default)
3688				return 1;
3689		}
3690
3691		/*
3692		 * we found a key greater than an xattr key, there can't
3693		 * be any acls later on
3694		 */
3695		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3696			return 0;
3697
3698		slot++;
3699		scanned++;
3700
3701		/*
3702		 * it goes inode, inode backrefs, xattrs, extents,
3703		 * so if there are a ton of hard links to an inode there can
3704		 * be a lot of backrefs.  Don't waste time searching too hard,
3705		 * this is just an optimization
3706		 */
3707		if (scanned >= 8)
3708			break;
3709	}
3710	/* we hit the end of the leaf before we found an xattr or
3711	 * something larger than an xattr.  We have to assume the inode
3712	 * has acls
3713	 */
3714	if (*first_xattr_slot == -1)
3715		*first_xattr_slot = slot;
3716	return 1;
3717}
3718
3719/*
3720 * read an inode from the btree into the in-memory inode
3721 */
3722static int btrfs_read_locked_inode(struct inode *inode,
3723				   struct btrfs_path *in_path)
3724{
3725	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3726	struct btrfs_path *path = in_path;
3727	struct extent_buffer *leaf;
3728	struct btrfs_inode_item *inode_item;
3729	struct btrfs_root *root = BTRFS_I(inode)->root;
3730	struct btrfs_key location;
3731	unsigned long ptr;
3732	int maybe_acls;
3733	u32 rdev;
3734	int ret;
3735	bool filled = false;
3736	int first_xattr_slot;
3737
3738	ret = btrfs_fill_inode(inode, &rdev);
3739	if (!ret)
3740		filled = true;
3741
3742	if (!path) {
3743		path = btrfs_alloc_path();
3744		if (!path)
3745			return -ENOMEM;
3746	}
3747
3748	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3749
3750	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3751	if (ret) {
3752		if (path != in_path)
3753			btrfs_free_path(path);
3754		return ret;
3755	}
3756
3757	leaf = path->nodes[0];
3758
3759	if (filled)
3760		goto cache_index;
3761
3762	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3763				    struct btrfs_inode_item);
3764	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3765	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3766	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3767	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3768	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3769	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3770			round_up(i_size_read(inode), fs_info->sectorsize));
3771
3772	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3773	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3774
3775	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3776	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3777
3778	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3779			btrfs_timespec_nsec(leaf, &inode_item->ctime));
3780
3781	BTRFS_I(inode)->i_otime.tv_sec =
3782		btrfs_timespec_sec(leaf, &inode_item->otime);
3783	BTRFS_I(inode)->i_otime.tv_nsec =
3784		btrfs_timespec_nsec(leaf, &inode_item->otime);
3785
3786	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3787	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3788	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3789
3790	inode_set_iversion_queried(inode,
3791				   btrfs_inode_sequence(leaf, inode_item));
3792	inode->i_generation = BTRFS_I(inode)->generation;
3793	inode->i_rdev = 0;
3794	rdev = btrfs_inode_rdev(leaf, inode_item);
3795
3796	BTRFS_I(inode)->index_cnt = (u64)-1;
3797	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3798				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3799
3800cache_index:
3801	/*
3802	 * If we were modified in the current generation and evicted from memory
3803	 * and then re-read we need to do a full sync since we don't have any
3804	 * idea about which extents were modified before we were evicted from
3805	 * cache.
3806	 *
3807	 * This is required for both inode re-read from disk and delayed inode
3808	 * in delayed_nodes_tree.
3809	 */
3810	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3811		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3812			&BTRFS_I(inode)->runtime_flags);
3813
3814	/*
3815	 * We don't persist the id of the transaction where an unlink operation
3816	 * against the inode was last made. So here we assume the inode might
3817	 * have been evicted, and therefore the exact value of last_unlink_trans
3818	 * lost, and set it to last_trans to avoid metadata inconsistencies
3819	 * was lost, and set it to last_trans to avoid metadata inconsistencies
3820	 * replayed. For example, in the scenario:
3821	 *
3822	 * touch mydir/foo
3823	 * ln mydir/foo mydir/bar
3824	 * sync
3825	 * unlink mydir/bar
3826	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3827	 * xfs_io -c fsync mydir/foo
3828	 * <power failure>
3829	 * mount fs, triggers fsync log replay
3830	 *
3831	 * We must make sure that when we fsync our inode foo we also log its
3832	 * parent inode, otherwise after log replay the parent still has the
3833	 * dentry with the "bar" name but our inode foo has a link count of 1
3834	 * and doesn't have an inode ref with the name "bar" anymore.
3835	 *
3836	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3837	 * but it guarantees correctness at the expense of occasional full
3838	 * transaction commits on fsync if our inode is a directory, or if our
3839	 * inode is not a directory, logging its parent unnecessarily.
3840	 */
3841	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3842
3843	/*
3844	 * Same logic as for last_unlink_trans. We don't persist the generation
3845	 * of the last transaction where this inode was used for a reflink
3846	 * operation, so after eviction and reloading the inode we must be
3847	 * pessimistic and assume the last transaction that modified the inode.
3848	 */
3849	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3850
3851	path->slots[0]++;
3852	if (inode->i_nlink != 1 ||
3853	    path->slots[0] >= btrfs_header_nritems(leaf))
3854		goto cache_acl;
3855
3856	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3857	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3858		goto cache_acl;
3859
3860	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3861	if (location.type == BTRFS_INODE_REF_KEY) {
3862		struct btrfs_inode_ref *ref;
3863
3864		ref = (struct btrfs_inode_ref *)ptr;
3865		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3866	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3867		struct btrfs_inode_extref *extref;
3868
3869		extref = (struct btrfs_inode_extref *)ptr;
3870		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3871								     extref);
3872	}
3873cache_acl:
3874	/*
3875	 * try to precache a NULL acl entry for files that don't have
3876	 * any xattrs or acls
3877	 */
3878	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3879			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3880	if (first_xattr_slot != -1) {
3881		path->slots[0] = first_xattr_slot;
3882		ret = btrfs_load_inode_props(inode, path);
3883		if (ret)
3884			btrfs_err(fs_info,
3885				  "error loading props for ino %llu (root %llu): %d",
3886				  btrfs_ino(BTRFS_I(inode)),
3887				  root->root_key.objectid, ret);
3888	}
3889	if (path != in_path)
3890		btrfs_free_path(path);
3891
3892	if (!maybe_acls)
3893		cache_no_acl(inode);
3894
3895	switch (inode->i_mode & S_IFMT) {
3896	case S_IFREG:
3897		inode->i_mapping->a_ops = &btrfs_aops;
3898		inode->i_fop = &btrfs_file_operations;
3899		inode->i_op = &btrfs_file_inode_operations;
3900		break;
3901	case S_IFDIR:
3902		inode->i_fop = &btrfs_dir_file_operations;
3903		inode->i_op = &btrfs_dir_inode_operations;
3904		break;
3905	case S_IFLNK:
3906		inode->i_op = &btrfs_symlink_inode_operations;
3907		inode_nohighmem(inode);
3908		inode->i_mapping->a_ops = &btrfs_aops;
3909		break;
3910	default:
3911		inode->i_op = &btrfs_special_inode_operations;
3912		init_special_inode(inode, inode->i_mode, rdev);
3913		break;
3914	}
3915
3916	btrfs_sync_inode_flags_to_i_flags(inode);
3917	return 0;
3918}
3919
3920/*
3921 * given a leaf and an inode, copy the inode fields into the leaf
3922 */
3923static void fill_inode_item(struct btrfs_trans_handle *trans,
3924			    struct extent_buffer *leaf,
3925			    struct btrfs_inode_item *item,
3926			    struct inode *inode)
3927{
3928	struct btrfs_map_token token;
3929	u64 flags;
3930
3931	btrfs_init_map_token(&token, leaf);
3932
3933	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3934	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3935	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3936	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3937	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3938
3939	btrfs_set_token_timespec_sec(&token, &item->atime,
3940				     inode->i_atime.tv_sec);
3941	btrfs_set_token_timespec_nsec(&token, &item->atime,
3942				      inode->i_atime.tv_nsec);
3943
3944	btrfs_set_token_timespec_sec(&token, &item->mtime,
3945				     inode->i_mtime.tv_sec);
3946	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3947				      inode->i_mtime.tv_nsec);
3948
3949	btrfs_set_token_timespec_sec(&token, &item->ctime,
3950				     inode_get_ctime(inode).tv_sec);
3951	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3952				      inode_get_ctime(inode).tv_nsec);
3953
3954	btrfs_set_token_timespec_sec(&token, &item->otime,
3955				     BTRFS_I(inode)->i_otime.tv_sec);
3956	btrfs_set_token_timespec_nsec(&token, &item->otime,
3957				      BTRFS_I(inode)->i_otime.tv_nsec);
3958
3959	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3960	btrfs_set_token_inode_generation(&token, item,
3961					 BTRFS_I(inode)->generation);
3962	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3963	btrfs_set_token_inode_transid(&token, item, trans->transid);
3964	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3965	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3966					  BTRFS_I(inode)->ro_flags);
3967	btrfs_set_token_inode_flags(&token, item, flags);
3968	btrfs_set_token_inode_block_group(&token, item, 0);
3969}
3970
3971/*
3972 * copy everything in the in-memory inode into the btree.
3973 */
3974static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3975				struct btrfs_root *root,
3976				struct btrfs_inode *inode)
3977{
3978	struct btrfs_inode_item *inode_item;
3979	struct btrfs_path *path;
3980	struct extent_buffer *leaf;
3981	int ret;
3982
3983	path = btrfs_alloc_path();
3984	if (!path)
3985		return -ENOMEM;
3986
3987	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3988	if (ret) {
3989		if (ret > 0)
3990			ret = -ENOENT;
3991		goto failed;
3992	}
3993
3994	leaf = path->nodes[0];
3995	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3996				    struct btrfs_inode_item);
3997
3998	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
3999	btrfs_mark_buffer_dirty(trans, leaf);
4000	btrfs_set_inode_last_trans(trans, inode);
4001	ret = 0;
4002failed:
4003	btrfs_free_path(path);
4004	return ret;
4005}
4006
4007/*
4008 * copy everything in the in-memory inode into the btree.
4009 */
4010noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4011				struct btrfs_root *root,
4012				struct btrfs_inode *inode)
4013{
4014	struct btrfs_fs_info *fs_info = root->fs_info;
4015	int ret;
4016
4017	/*
4018	 * If the inode is a free space inode, we can deadlock during commit
4019	 * if we put it into the delayed code.
4020	 *
4021	 * The data relocation inode should also be directly updated
4022	 * without delay
4023	 */
4024	if (!btrfs_is_free_space_inode(inode)
4025	    && !btrfs_is_data_reloc_root(root)
4026	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4027		btrfs_update_root_times(trans, root);
4028
4029		ret = btrfs_delayed_update_inode(trans, root, inode);
4030		if (!ret)
4031			btrfs_set_inode_last_trans(trans, inode);
4032		return ret;
4033	}
4034
4035	return btrfs_update_inode_item(trans, root, inode);
4036}
4037
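/*
 * Update the inode, falling back to a direct update of the inode item if the
 * delayed inode path fails with -ENOSPC.
 */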
4038int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4039				struct btrfs_root *root, struct btrfs_inode *inode)
4040{
4041	int ret;
4042
4043	ret = btrfs_update_inode(trans, root, inode);
4044	if (ret == -ENOSPC)
4045		return btrfs_update_inode_item(trans, root, inode);
4046	return ret;
4047}
4048
4049/*
4050 * unlink helper that gets used here in inode.c and in the tree logging
4051 * recovery code.  It removes a link in a directory with a given name, and
4052 * also drops the back refs in the inode to the directory
4053 */
4054static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4055				struct btrfs_inode *dir,
4056				struct btrfs_inode *inode,
4057				const struct fscrypt_str *name,
4058				struct btrfs_rename_ctx *rename_ctx)
4059{
4060	struct btrfs_root *root = dir->root;
4061	struct btrfs_fs_info *fs_info = root->fs_info;
4062	struct btrfs_path *path;
4063	int ret = 0;
4064	struct btrfs_dir_item *di;
4065	u64 index;
4066	u64 ino = btrfs_ino(inode);
4067	u64 dir_ino = btrfs_ino(dir);
4068
4069	path = btrfs_alloc_path();
4070	if (!path) {
4071		ret = -ENOMEM;
4072		goto out;
4073	}
4074
4075	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4076	if (IS_ERR_OR_NULL(di)) {
4077		ret = di ? PTR_ERR(di) : -ENOENT;
4078		goto err;
4079	}
4080	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4081	if (ret)
4082		goto err;
4083	btrfs_release_path(path);
4084
4085	/*
4086	 * If we don't have a cached dir index, we have to get it by looking up
4087	 * the inode ref. Since we look up the inode ref anyway, remove it
4088	 * directly; it is unnecessary to do a delayed deletion for it.
4089	 *
4090	 * But if we do have the dir index, there is no need to search for the
4091	 * inode ref. Since the inode ref is close to the inode item, it is
4092	 * better to delay its deletion, and just do it when we update the
4093	 * inode item.
4094	 */
4095	if (inode->dir_index) {
4096		ret = btrfs_delayed_delete_inode_ref(inode);
4097		if (!ret) {
4098			index = inode->dir_index;
4099			goto skip_backref;
4100		}
4101	}
4102
4103	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4104	if (ret) {
4105		btrfs_info(fs_info,
4106			"failed to delete reference to %.*s, inode %llu parent %llu",
4107			name->len, name->name, ino, dir_ino);
4108		btrfs_abort_transaction(trans, ret);
4109		goto err;
4110	}
4111skip_backref:
4112	if (rename_ctx)
4113		rename_ctx->index = index;
4114
4115	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4116	if (ret) {
4117		btrfs_abort_transaction(trans, ret);
4118		goto err;
4119	}
4120
4121	/*
4122	 * If we are in a rename context, we don't need to update anything in the
4123	 * log. That will be done later during the rename by btrfs_log_new_name().
4124	 * Besides that, doing it here would only cause extra unnecessary btree
4125	 * operations on the log tree, increasing latency for applications.
4126	 */
4127	if (!rename_ctx) {
4128		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4129		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4130	}
4131
4132	/*
4133	 * If we have a pending delayed iput we could end up with the final iput
4134	 * being run in btrfs-cleaner context.  If we have enough of these built
4135	 * up we can end up burning a lot of time in btrfs-cleaner without any
4136	 * way to throttle the unlinks.  Since we're currently holding a ref on
4137	 * the inode we can run the delayed iput here without any issues as the
4138	 * final iput won't be done until after we drop the ref we're currently
4139	 * holding.
4140	 */
4141	btrfs_run_delayed_iput(fs_info, inode);
4142err:
4143	btrfs_free_path(path);
4144	if (ret)
4145		goto out;
4146
4147	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4148	inode_inc_iversion(&inode->vfs_inode);
4149	inode_inc_iversion(&dir->vfs_inode);
4150	inode_set_ctime_current(&inode->vfs_inode);
4151	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4152	ret = btrfs_update_inode(trans, root, dir);
4153out:
4154	return ret;
4155}
4156
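/*
 * Remove the directory entry @name in @dir pointing to @inode, drop the
 * inode's link count and update the inode item on success.
 */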
4157int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4158		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4159		       const struct fscrypt_str *name)
4160{
4161	int ret;
4162
4163	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4164	if (!ret) {
4165		drop_nlink(&inode->vfs_inode);
4166		ret = btrfs_update_inode(trans, inode->root, inode);
4167	}
4168	return ret;
4169}
4170
4171/*
4172 * helper to start transaction for unlink and rmdir.
4173 *
4174 * unlink and rmdir are special in btrfs: they do not always free space, so
4175 * if we cannot make our reservations the normal way, try and see if there is
4176 * plenty of slack room in the global reserve to migrate; otherwise we cannot
4177 * allow the unlink to occur.
4178 */
4179static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4180{
4181	struct btrfs_root *root = dir->root;
4182
4183	return btrfs_start_transaction_fallback_global_rsv(root,
4184						   BTRFS_UNLINK_METADATA_UNITS);
4185}
4186
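/*
 * The unlink directory operation: remove the name from @dir and, if that was
 * the last link, add an orphan item so the inode's items can be deleted once
 * the last reference to it is dropped.
 */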
4187static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4188{
4189	struct btrfs_trans_handle *trans;
4190	struct inode *inode = d_inode(dentry);
4191	int ret;
4192	struct fscrypt_name fname;
4193
4194	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4195	if (ret)
4196		return ret;
4197
4198	/* This needs to handle no-key deletions later on */
4199
4200	trans = __unlink_start_trans(BTRFS_I(dir));
4201	if (IS_ERR(trans)) {
4202		ret = PTR_ERR(trans);
4203		goto fscrypt_free;
4204	}
4205
4206	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4207				false);
4208
4209	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4210				 &fname.disk_name);
4211	if (ret)
4212		goto end_trans;
4213
4214	if (inode->i_nlink == 0) {
4215		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4216		if (ret)
4217			goto end_trans;
4218	}
4219
4220end_trans:
4221	btrfs_end_transaction(trans);
4222	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4223fscrypt_free:
4224	fscrypt_free_filename(&fname);
4225	return ret;
4226}
4227
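/*
 * Remove the directory entry for a subvolume root, or for the empty
 * placeholder directory created for a subvolume we had no reference to when
 * a snapshot was taken. Deletes the dir item, the root ref (or the dir index
 * item in the placeholder case) and the delayed dir index, and updates the
 * parent directory's size and timestamps.
 */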
4228static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4229			       struct btrfs_inode *dir, struct dentry *dentry)
4230{
4231	struct btrfs_root *root = dir->root;
4232	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4233	struct btrfs_path *path;
4234	struct extent_buffer *leaf;
4235	struct btrfs_dir_item *di;
4236	struct btrfs_key key;
4237	u64 index;
4238	int ret;
4239	u64 objectid;
4240	u64 dir_ino = btrfs_ino(dir);
4241	struct fscrypt_name fname;
4242
4243	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4244	if (ret)
4245		return ret;
4246
4247	/* This needs to handle no-key deletions later on */
4248
4249	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4250		objectid = inode->root->root_key.objectid;
4251	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4252		objectid = inode->location.objectid;
4253	} else {
4254		WARN_ON(1);
4255		fscrypt_free_filename(&fname);
4256		return -EINVAL;
4257	}
4258
4259	path = btrfs_alloc_path();
4260	if (!path) {
4261		ret = -ENOMEM;
4262		goto out;
4263	}
4264
4265	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4266				   &fname.disk_name, -1);
4267	if (IS_ERR_OR_NULL(di)) {
4268		ret = di ? PTR_ERR(di) : -ENOENT;
4269		goto out;
4270	}
4271
4272	leaf = path->nodes[0];
4273	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4274	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4275	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4276	if (ret) {
4277		btrfs_abort_transaction(trans, ret);
4278		goto out;
4279	}
4280	btrfs_release_path(path);
4281
4282	/*
4283	 * This is a placeholder inode for a subvolume we didn't have a
4284	 * reference to at the time of the snapshot creation.  In the meantime
4285	 * we could have renamed the real subvol link into our snapshot, so
4286	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4287	 * Instead simply lookup the dir_index_item for this entry so we can
4288	 * remove it.  Otherwise we know we have a ref to the root and we can
4289	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4290	 */
4291	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4292		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4293		if (IS_ERR_OR_NULL(di)) {
4294			if (!di)
4295				ret = -ENOENT;
4296			else
4297				ret = PTR_ERR(di);
4298			btrfs_abort_transaction(trans, ret);
4299			goto out;
4300		}
4301
4302		leaf = path->nodes[0];
4303		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4304		index = key.offset;
4305		btrfs_release_path(path);
4306	} else {
4307		ret = btrfs_del_root_ref(trans, objectid,
4308					 root->root_key.objectid, dir_ino,
4309					 &index, &fname.disk_name);
4310		if (ret) {
4311			btrfs_abort_transaction(trans, ret);
4312			goto out;
4313		}
4314	}
4315
4316	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4317	if (ret) {
4318		btrfs_abort_transaction(trans, ret);
4319		goto out;
4320	}
4321
4322	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4323	inode_inc_iversion(&dir->vfs_inode);
4324	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4325	ret = btrfs_update_inode_fallback(trans, root, dir);
4326	if (ret)
4327		btrfs_abort_transaction(trans, ret);
4328out:
4329	btrfs_free_path(path);
4330	fscrypt_free_filename(&fname);
4331	return ret;
4332}
4333
4334/*
4335 * Helper to check if the subvolume references other subvolumes or if it's
4336 * the default subvolume.
4337 */
4338static noinline int may_destroy_subvol(struct btrfs_root *root)
4339{
4340	struct btrfs_fs_info *fs_info = root->fs_info;
4341	struct btrfs_path *path;
4342	struct btrfs_dir_item *di;
4343	struct btrfs_key key;
4344	struct fscrypt_str name = FSTR_INIT("default", 7);
4345	u64 dir_id;
4346	int ret;
4347
4348	path = btrfs_alloc_path();
4349	if (!path)
4350		return -ENOMEM;
4351
4352	/* Make sure this root isn't set as the default subvol */
4353	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4354	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4355				   dir_id, &name, 0);
4356	if (di && !IS_ERR(di)) {
4357		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4358		if (key.objectid == root->root_key.objectid) {
4359			ret = -EPERM;
4360			btrfs_err(fs_info,
4361				  "deleting default subvolume %llu is not allowed",
4362				  key.objectid);
4363			goto out;
4364		}
4365		btrfs_release_path(path);
4366	}
4367
4368	key.objectid = root->root_key.objectid;
4369	key.type = BTRFS_ROOT_REF_KEY;
4370	key.offset = (u64)-1;
4371
4372	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4373	if (ret < 0)
4374		goto out;
4375	BUG_ON(ret == 0);
4376
4377	ret = 0;
4378	if (path->slots[0] > 0) {
4379		path->slots[0]--;
4380		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4381		if (key.objectid == root->root_key.objectid &&
4382		    key.type == BTRFS_ROOT_REF_KEY)
4383			ret = -ENOTEMPTY;
4384	}
4385out:
4386	btrfs_free_path(path);
4387	return ret;
4388}
4389
4390/* Delete all dentries for inodes belonging to the root */
4391static void btrfs_prune_dentries(struct btrfs_root *root)
4392{
4393	struct btrfs_fs_info *fs_info = root->fs_info;
4394	struct rb_node *node;
4395	struct rb_node *prev;
4396	struct btrfs_inode *entry;
4397	struct inode *inode;
4398	u64 objectid = 0;
4399
4400	if (!BTRFS_FS_ERROR(fs_info))
4401		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4402
4403	spin_lock(&root->inode_lock);
4404again:
4405	node = root->inode_tree.rb_node;
4406	prev = NULL;
4407	while (node) {
4408		prev = node;
4409		entry = rb_entry(node, struct btrfs_inode, rb_node);
4410
4411		if (objectid < btrfs_ino(entry))
4412			node = node->rb_left;
4413		else if (objectid > btrfs_ino(entry))
4414			node = node->rb_right;
4415		else
4416			break;
4417	}
4418	if (!node) {
4419		while (prev) {
4420			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4421			if (objectid <= btrfs_ino(entry)) {
4422				node = prev;
4423				break;
4424			}
4425			prev = rb_next(prev);
4426		}
4427	}
4428	while (node) {
4429		entry = rb_entry(node, struct btrfs_inode, rb_node);
4430		objectid = btrfs_ino(entry) + 1;
4431		inode = igrab(&entry->vfs_inode);
4432		if (inode) {
4433			spin_unlock(&root->inode_lock);
4434			if (atomic_read(&inode->i_count) > 1)
4435				d_prune_aliases(inode);
4436			/*
4437			 * btrfs_drop_inode will have it removed from the inode
4438			 * cache when its usage count hits zero.
4439			 */
4440			iput(inode);
4441			cond_resched();
4442			spin_lock(&root->inode_lock);
4443			goto again;
4444		}
4445
4446		if (cond_resched_lock(&root->inode_lock))
4447			goto again;
4448
4449		node = rb_next(node);
4450	}
4451	spin_unlock(&root->inode_lock);
4452}
4453
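/*
 * Delete a subvolume or snapshot: unlink it from its parent directory, mark
 * the root dead and orphaned so the cleaner can drop its tree later, and
 * remove its UUID tree entries. Deletion is refused while a send is in
 * progress or while the subvolume hosts an active swapfile.
 */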
4454int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4455{
4456	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4457	struct btrfs_root *root = dir->root;
4458	struct inode *inode = d_inode(dentry);
4459	struct btrfs_root *dest = BTRFS_I(inode)->root;
4460	struct btrfs_trans_handle *trans;
4461	struct btrfs_block_rsv block_rsv;
4462	u64 root_flags;
4463	int ret;
4464
4465	down_write(&fs_info->subvol_sem);
4466
4467	/*
4468	 * Don't allow deleting a subvolume with a send in progress. This is
4469	 * inside the inode lock so the error handling that has to drop the bit
4470	 * again is not run concurrently.
4471	 */
4472	spin_lock(&dest->root_item_lock);
4473	if (dest->send_in_progress) {
4474		spin_unlock(&dest->root_item_lock);
4475		btrfs_warn(fs_info,
4476			   "attempt to delete subvolume %llu during send",
4477			   dest->root_key.objectid);
4478		ret = -EPERM;
4479		goto out_up_write;
4480	}
4481	if (atomic_read(&dest->nr_swapfiles)) {
4482		spin_unlock(&dest->root_item_lock);
4483		btrfs_warn(fs_info,
4484			   "attempt to delete subvolume %llu with active swapfile",
4485			   root->root_key.objectid);
4486		ret = -EPERM;
4487		goto out_up_write;
4488	}
4489	root_flags = btrfs_root_flags(&dest->root_item);
4490	btrfs_set_root_flags(&dest->root_item,
4491			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4492	spin_unlock(&dest->root_item_lock);
4493
4494	ret = may_destroy_subvol(dest);
4495	if (ret)
4496		goto out_undead;
4497
4498	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4499	/*
4500	 * One for dir inode,
4501	 * two for dir entries,
4502	 * two for root ref/backref.
4503	 */
4504	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4505	if (ret)
4506		goto out_undead;
4507
4508	trans = btrfs_start_transaction(root, 0);
4509	if (IS_ERR(trans)) {
4510		ret = PTR_ERR(trans);
4511		goto out_release;
4512	}
4513	trans->block_rsv = &block_rsv;
4514	trans->bytes_reserved = block_rsv.size;
4515
4516	btrfs_record_snapshot_destroy(trans, dir);
4517
4518	ret = btrfs_unlink_subvol(trans, dir, dentry);
4519	if (ret) {
4520		btrfs_abort_transaction(trans, ret);
4521		goto out_end_trans;
4522	}
4523
4524	ret = btrfs_record_root_in_trans(trans, dest);
4525	if (ret) {
4526		btrfs_abort_transaction(trans, ret);
4527		goto out_end_trans;
4528	}
4529
4530	memset(&dest->root_item.drop_progress, 0,
4531		sizeof(dest->root_item.drop_progress));
4532	btrfs_set_root_drop_level(&dest->root_item, 0);
4533	btrfs_set_root_refs(&dest->root_item, 0);
4534
4535	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4536		ret = btrfs_insert_orphan_item(trans,
4537					fs_info->tree_root,
4538					dest->root_key.objectid);
4539		if (ret) {
4540			btrfs_abort_transaction(trans, ret);
4541			goto out_end_trans;
4542		}
4543	}
4544
4545	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4546				  BTRFS_UUID_KEY_SUBVOL,
4547				  dest->root_key.objectid);
4548	if (ret && ret != -ENOENT) {
4549		btrfs_abort_transaction(trans, ret);
4550		goto out_end_trans;
4551	}
4552	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4553		ret = btrfs_uuid_tree_remove(trans,
4554					  dest->root_item.received_uuid,
4555					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4556					  dest->root_key.objectid);
4557		if (ret && ret != -ENOENT) {
4558			btrfs_abort_transaction(trans, ret);
4559			goto out_end_trans;
4560		}
4561	}
4562
4563	free_anon_bdev(dest->anon_dev);
4564	dest->anon_dev = 0;
4565out_end_trans:
4566	trans->block_rsv = NULL;
4567	trans->bytes_reserved = 0;
4568	ret = btrfs_end_transaction(trans);
4569	inode->i_flags |= S_DEAD;
4570out_release:
4571	btrfs_subvolume_release_metadata(root, &block_rsv);
4572out_undead:
4573	if (ret) {
4574		spin_lock(&dest->root_item_lock);
4575		root_flags = btrfs_root_flags(&dest->root_item);
4576		btrfs_set_root_flags(&dest->root_item,
4577				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4578		spin_unlock(&dest->root_item_lock);
4579	}
4580out_up_write:
4581	up_write(&fs_info->subvol_sem);
4582	if (!ret) {
4583		d_invalidate(dentry);
4584		btrfs_prune_dentries(dest);
4585		ASSERT(dest->send_in_progress == 0);
4586	}
4587
4588	return ret;
4589}
4590
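/*
 * The rmdir directory operation. Subvolumes and snapshots are handed off to
 * btrfs_delete_subvolume(), empty placeholder subvolume directories go
 * through btrfs_unlink_subvol(), and regular (empty) directories are unlinked
 * with an orphan item added so eviction can remove their remaining items.
 */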
4591static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4592{
4593	struct inode *inode = d_inode(dentry);
4594	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4595	int err = 0;
4596	struct btrfs_trans_handle *trans;
4597	u64 last_unlink_trans;
4598	struct fscrypt_name fname;
4599
4600	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4601		return -ENOTEMPTY;
4602	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4603		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4604			btrfs_err(fs_info,
4605			"extent tree v2 doesn't support snapshot deletion yet");
4606			return -EOPNOTSUPP;
4607		}
4608		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4609	}
4610
4611	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4612	if (err)
4613		return err;
4614
4615	/* This needs to handle no-key deletions later on */
4616
4617	trans = __unlink_start_trans(BTRFS_I(dir));
4618	if (IS_ERR(trans)) {
4619		err = PTR_ERR(trans);
4620		goto out_notrans;
4621	}
4622
4623	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4624		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4625		goto out;
4626	}
4627
4628	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4629	if (err)
4630		goto out;
4631
4632	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4633
4634	/* now the directory is empty */
4635	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4636				 &fname.disk_name);
4637	if (!err) {
4638		btrfs_i_size_write(BTRFS_I(inode), 0);
4639		/*
4640		 * Propagate the last_unlink_trans value of the deleted dir to
4641		 * its parent directory. This is to prevent an unrecoverable
4642		 * log tree in the case we do something like this:
4643		 * 1) create dir foo
4644		 * 2) create snapshot under dir foo
4645		 * 3) delete the snapshot
4646		 * 4) rmdir foo
4647		 * 5) mkdir foo
4648		 * 6) fsync foo or some file inside foo
4649		 */
4650		if (last_unlink_trans >= trans->transid)
4651			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4652	}
4653out:
4654	btrfs_end_transaction(trans);
4655out_notrans:
4656	btrfs_btree_balance_dirty(fs_info);
4657	fscrypt_free_filename(&fname);
4658
4659	return err;
4660}
4661
4662/*
4663 * btrfs_truncate_block - read, zero a chunk and write a block
4664 * @inode - inode that we're zeroing
4665 * @from - the offset to start zeroing
4666 * @len - the length to zero, 0 to zero the entire range relative to the
4667 *	offset
4668 * @front - zero up to the offset instead of from the offset on
4669 *
4670 * This will find the block for the "from" offset and cow the block and zero the
4671 * part we want to zero.  This is used with truncate and hole punching.
4672 */
4673int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4674			 int front)
4675{
4676	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4677	struct address_space *mapping = inode->vfs_inode.i_mapping;
4678	struct extent_io_tree *io_tree = &inode->io_tree;
4679	struct btrfs_ordered_extent *ordered;
4680	struct extent_state *cached_state = NULL;
4681	struct extent_changeset *data_reserved = NULL;
4682	bool only_release_metadata = false;
4683	u32 blocksize = fs_info->sectorsize;
4684	pgoff_t index = from >> PAGE_SHIFT;
4685	unsigned offset = from & (blocksize - 1);
4686	struct page *page;
4687	gfp_t mask = btrfs_alloc_write_mask(mapping);
4688	size_t write_bytes = blocksize;
4689	int ret = 0;
4690	u64 block_start;
4691	u64 block_end;
4692
4693	if (IS_ALIGNED(offset, blocksize) &&
4694	    (!len || IS_ALIGNED(len, blocksize)))
4695		goto out;
4696
4697	block_start = round_down(from, blocksize);
4698	block_end = block_start + blocksize - 1;
4699
4700	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4701					  blocksize, false);
4702	if (ret < 0) {
4703		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4704			/* For nocow case, no need to reserve data space */
4705			only_release_metadata = true;
4706		} else {
4707			goto out;
4708		}
4709	}
4710	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4711	if (ret < 0) {
4712		if (!only_release_metadata)
4713			btrfs_free_reserved_data_space(inode, data_reserved,
4714						       block_start, blocksize);
4715		goto out;
4716	}
4717again:
4718	page = find_or_create_page(mapping, index, mask);
4719	if (!page) {
4720		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4721					     blocksize, true);
4722		btrfs_delalloc_release_extents(inode, blocksize);
4723		ret = -ENOMEM;
4724		goto out;
4725	}
4726
4727	if (!PageUptodate(page)) {
4728		ret = btrfs_read_folio(NULL, page_folio(page));
4729		lock_page(page);
4730		if (page->mapping != mapping) {
4731			unlock_page(page);
4732			put_page(page);
4733			goto again;
4734		}
4735		if (!PageUptodate(page)) {
4736			ret = -EIO;
4737			goto out_unlock;
4738		}
4739	}
4740
4741	/*
4742	 * We unlock the page after the io is completed and then re-lock it
4743	 * above.  release_folio() could have come in between that and cleared
4744	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
4745	 * here to make sure it's properly set for the subpage stuff.
4746	 */
4747	ret = set_page_extent_mapped(page);
4748	if (ret < 0)
4749		goto out_unlock;
4750
4751	wait_on_page_writeback(page);
4752
4753	lock_extent(io_tree, block_start, block_end, &cached_state);
4754
4755	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4756	if (ordered) {
4757		unlock_extent(io_tree, block_start, block_end, &cached_state);
4758		unlock_page(page);
4759		put_page(page);
4760		btrfs_start_ordered_extent(ordered);
4761		btrfs_put_ordered_extent(ordered);
4762		goto again;
4763	}
4764
4765	clear_extent_bit(&inode->io_tree, block_start, block_end,
4766			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4767			 &cached_state);
4768
4769	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4770					&cached_state);
4771	if (ret) {
4772		unlock_extent(io_tree, block_start, block_end, &cached_state);
4773		goto out_unlock;
4774	}
4775
4776	if (offset != blocksize) {
4777		if (!len)
4778			len = blocksize - offset;
4779		if (front)
4780			memzero_page(page, (block_start - page_offset(page)),
4781				     offset);
4782		else
4783			memzero_page(page, (block_start - page_offset(page)) + offset,
4784				     len);
4785	}
4786	btrfs_page_clear_checked(fs_info, page, block_start,
4787				 block_end + 1 - block_start);
4788	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4789	unlock_extent(io_tree, block_start, block_end, &cached_state);
4790
4791	if (only_release_metadata)
4792		set_extent_bit(&inode->io_tree, block_start, block_end,
4793			       EXTENT_NORESERVE, NULL);
4794
4795out_unlock:
4796	if (ret) {
4797		if (only_release_metadata)
4798			btrfs_delalloc_release_metadata(inode, blocksize, true);
4799		else
4800			btrfs_delalloc_release_space(inode, data_reserved,
4801					block_start, blocksize, true);
4802	}
4803	btrfs_delalloc_release_extents(inode, blocksize);
4804	unlock_page(page);
4805	put_page(page);
4806out:
4807	if (only_release_metadata)
4808		btrfs_check_nocow_unlock(inode);
4809	extent_changeset_free(data_reserved);
4810	return ret;
4811}
4812
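/*
 * Insert an explicit hole file extent for the range [offset, offset + len).
 * With the NO_HOLES feature enabled this is a no-op, otherwise any extents in
 * the range are dropped first and the inode is updated.
 */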
4813static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4814			     u64 offset, u64 len)
4815{
4816	struct btrfs_fs_info *fs_info = root->fs_info;
4817	struct btrfs_trans_handle *trans;
4818	struct btrfs_drop_extents_args drop_args = { 0 };
4819	int ret;
4820
4821	/*
4822	 * If NO_HOLES is enabled, we don't need to do anything.
4823	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4824	 * or btrfs_update_inode() will be called, which guarantee that the next
4825	 * fsync will know this inode was changed and needs to be logged.
4826	 */
4827	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4828		return 0;
4829
4830	/*
4831	 * 1 - for the one we're dropping
4832	 * 1 - for the one we're adding
4833	 * 1 - for updating the inode.
4834	 */
4835	trans = btrfs_start_transaction(root, 3);
4836	if (IS_ERR(trans))
4837		return PTR_ERR(trans);
4838
4839	drop_args.start = offset;
4840	drop_args.end = offset + len;
4841	drop_args.drop_cache = true;
4842
4843	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4844	if (ret) {
4845		btrfs_abort_transaction(trans, ret);
4846		btrfs_end_transaction(trans);
4847		return ret;
4848	}
4849
4850	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4851	if (ret) {
4852		btrfs_abort_transaction(trans, ret);
4853	} else {
4854		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4855		btrfs_update_inode(trans, root, inode);
4856	}
4857	btrfs_end_transaction(trans);
4858	return ret;
4859}
4860
4861/*
4862 * This function puts in dummy file extents for the area we're creating a hole
4863 * for.  So if we are truncating this file to a larger size we need to insert
4864 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
4865 * the range between oldsize and size
4866 */
4867int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4868{
4869	struct btrfs_root *root = inode->root;
4870	struct btrfs_fs_info *fs_info = root->fs_info;
4871	struct extent_io_tree *io_tree = &inode->io_tree;
4872	struct extent_map *em = NULL;
4873	struct extent_state *cached_state = NULL;
4874	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4875	u64 block_end = ALIGN(size, fs_info->sectorsize);
4876	u64 last_byte;
4877	u64 cur_offset;
4878	u64 hole_size;
4879	int err = 0;
4880
4881	/*
4882	 * If our size started in the middle of a block we need to zero out the
4883	 * rest of the block before we expand the i_size, otherwise we could
4884	 * expose stale data.
4885	 */
4886	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4887	if (err)
4888		return err;
4889
4890	if (size <= hole_start)
4891		return 0;
4892
4893	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4894					   &cached_state);
4895	cur_offset = hole_start;
4896	while (1) {
4897		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4898				      block_end - cur_offset);
4899		if (IS_ERR(em)) {
4900			err = PTR_ERR(em);
4901			em = NULL;
4902			break;
4903		}
4904		last_byte = min(extent_map_end(em), block_end);
4905		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4906		hole_size = last_byte - cur_offset;
4907
4908		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4909			struct extent_map *hole_em;
4910
4911			err = maybe_insert_hole(root, inode, cur_offset,
4912						hole_size);
4913			if (err)
4914				break;
4915
4916			err = btrfs_inode_set_file_extent_range(inode,
4917							cur_offset, hole_size);
4918			if (err)
4919				break;
4920
4921			hole_em = alloc_extent_map();
4922			if (!hole_em) {
4923				btrfs_drop_extent_map_range(inode, cur_offset,
4924						    cur_offset + hole_size - 1,
4925						    false);
4926				btrfs_set_inode_full_sync(inode);
4927				goto next;
4928			}
4929			hole_em->start = cur_offset;
4930			hole_em->len = hole_size;
4931			hole_em->orig_start = cur_offset;
4932
4933			hole_em->block_start = EXTENT_MAP_HOLE;
4934			hole_em->block_len = 0;
4935			hole_em->orig_block_len = 0;
4936			hole_em->ram_bytes = hole_size;
4937			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4938			hole_em->generation = fs_info->generation;
4939
4940			err = btrfs_replace_extent_map_range(inode, hole_em, true);
4941			free_extent_map(hole_em);
4942		} else {
4943			err = btrfs_inode_set_file_extent_range(inode,
4944							cur_offset, hole_size);
4945			if (err)
4946				break;
4947		}
4948next:
4949		free_extent_map(em);
4950		em = NULL;
4951		cur_offset = last_byte;
4952		if (cur_offset >= block_end)
4953			break;
4954	}
4955	free_extent_map(em);
4956	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
4957	return err;
4958}
4959
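/*
 * Apply the ATTR_SIZE part of a setattr: growing files have the new range
 * filled with hole extents via btrfs_cont_expand() before i_size is bumped,
 * while shrinking files go through truncate_setsize() and btrfs_truncate().
 */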
4960static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4961{
4962	struct btrfs_root *root = BTRFS_I(inode)->root;
4963	struct btrfs_trans_handle *trans;
4964	loff_t oldsize = i_size_read(inode);
4965	loff_t newsize = attr->ia_size;
4966	int mask = attr->ia_valid;
4967	int ret;
4968
4969	/*
4970	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4971	 * special case where we need to update the times despite not having
4972	 * these flags set.  For all other operations the VFS set these flags
4973	 * explicitly if it wants a timestamp update.
4974	 */
4975	if (newsize != oldsize) {
4976		inode_inc_iversion(inode);
4977		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
4978			inode->i_mtime = inode_set_ctime_current(inode);
4979		}
4980	}
4981
4982	if (newsize > oldsize) {
4983		/*
4984		 * Don't do an expanding truncate while snapshotting is ongoing.
4985		 * This is to ensure the snapshot captures a fully consistent
4986		 * state of this file - if the snapshot captures this expanding
4987		 * truncation, it must capture all writes that happened before
4988		 * this truncation.
4989		 */
4990		btrfs_drew_write_lock(&root->snapshot_lock);
4991		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
4992		if (ret) {
4993			btrfs_drew_write_unlock(&root->snapshot_lock);
4994			return ret;
4995		}
4996
4997		trans = btrfs_start_transaction(root, 1);
4998		if (IS_ERR(trans)) {
4999			btrfs_drew_write_unlock(&root->snapshot_lock);
5000			return PTR_ERR(trans);
5001		}
5002
5003		i_size_write(inode, newsize);
5004		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5005		pagecache_isize_extended(inode, oldsize, newsize);
5006		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5007		btrfs_drew_write_unlock(&root->snapshot_lock);
5008		btrfs_end_transaction(trans);
5009	} else {
5010		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5011
5012		if (btrfs_is_zoned(fs_info)) {
5013			ret = btrfs_wait_ordered_range(inode,
5014					ALIGN(newsize, fs_info->sectorsize),
5015					(u64)-1);
5016			if (ret)
5017				return ret;
5018		}
5019
5020		/*
5021		 * We're truncating a file that used to have good data down to
5022		 * zero. Make sure any new writes to the file get on disk
5023		 * on close.
5024		 */
5025		if (newsize == 0)
5026			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5027				&BTRFS_I(inode)->runtime_flags);
5028
5029		truncate_setsize(inode, newsize);
5030
5031		inode_dio_wait(inode);
5032
5033		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5034		if (ret && inode->i_nlink) {
5035			int err;
5036
5037			/*
5038			 * Truncate failed, so fix up the in-memory size. We
5039			 * adjusted disk_i_size down as we removed extents, so
5040			 * wait for disk_i_size to be stable and then update the
5041			 * in-memory size to match.
5042			 */
5043			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5044			if (err)
5045				return err;
5046			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5047		}
5048	}
5049
5050	return ret;
5051}
5052
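/*
 * The setattr inode operation: apply size changes via btrfs_setsize(), copy
 * the remaining attributes into the inode, and run posix_acl_chmod() when the
 * mode changed. Rejected with -EROFS on read-only roots.
 */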
5053static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5054			 struct iattr *attr)
5055{
5056	struct inode *inode = d_inode(dentry);
5057	struct btrfs_root *root = BTRFS_I(inode)->root;
5058	int err;
5059
5060	if (btrfs_root_readonly(root))
5061		return -EROFS;
5062
5063	err = setattr_prepare(idmap, dentry, attr);
5064	if (err)
5065		return err;
5066
5067	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5068		err = btrfs_setsize(inode, attr);
5069		if (err)
5070			return err;
5071	}
5072
5073	if (attr->ia_valid) {
5074		setattr_copy(idmap, inode, attr);
5075		inode_inc_iversion(inode);
5076		err = btrfs_dirty_inode(BTRFS_I(inode));
5077
5078		if (!err && attr->ia_valid & ATTR_MODE)
5079			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5080	}
5081
5082	return err;
5083}
5084
5085/*
5086 * While truncating the inode pages during eviction, we get the VFS
5087 * calling btrfs_invalidate_folio() against each folio of the inode. This
5088 * is slow because the calls to btrfs_invalidate_folio() result in a
5089 * huge number of calls to lock_extent() and clear_extent_bit(),
5090 * which keep merging and splitting extent_state structures over and over,
5091 * wasting lots of time.
5092 *
5093 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5094 * skip all those expensive operations on a per-folio basis and only do
5095 * the ordered io finishing, while we release the extent_map and
5096 * extent_state structures here, without the excessive merging and splitting.
5097 */
5098static void evict_inode_truncate_pages(struct inode *inode)
5099{
5100	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5101	struct rb_node *node;
5102
5103	ASSERT(inode->i_state & I_FREEING);
5104	truncate_inode_pages_final(&inode->i_data);
5105
5106	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5107
5108	/*
5109	 * Keep looping until we have no more ranges in the io tree.
5110	 * We can have ongoing bios started by readahead that have
5111	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5112	 * still in progress (they unlocked the pages in the bio but have not yet
5113	 * unlocked the ranges in the io tree). Therefore some
5114	 * ranges can still be locked and eviction started because before
5115	 * submitting those bios, which are executed by a separate task (work
5116	 * queue kthread), inode references (inode->i_count) were not taken
5117	 * (which would be dropped in the end io callback of each bio).
5118	 * Therefore here we effectively end up waiting for those bios and
5119	 * anyone else holding locked ranges without having bumped the inode's
5120	 * reference count - if we don't do it, when they access the inode's
5121	 * io_tree to unlock a range it may be too late, leading to an
5122	 * io_tree to unlock a range it may be too late, leading to a
5123	 */
5124	spin_lock(&io_tree->lock);
5125	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5126		struct extent_state *state;
5127		struct extent_state *cached_state = NULL;
5128		u64 start;
5129		u64 end;
5130		unsigned state_flags;
5131
5132		node = rb_first(&io_tree->state);
5133		state = rb_entry(node, struct extent_state, rb_node);
5134		start = state->start;
5135		end = state->end;
5136		state_flags = state->state;
5137		spin_unlock(&io_tree->lock);
5138
5139		lock_extent(io_tree, start, end, &cached_state);
5140
5141		/*
5142		 * If the range still has the DELALLOC flag, the extent didn't reach disk,
5143		 * and its reserved space won't be freed by delayed_ref.
5144		 * So we need to free its reserved space here.
5145		 * (Refer to comment in btrfs_invalidate_folio, case 2)
5146		 *
5147		 * Note, end is the bytenr of last byte, so we need + 1 here.
5148		 */
5149		if (state_flags & EXTENT_DELALLOC)
5150			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5151					       end - start + 1, NULL);
5152
5153		clear_extent_bit(io_tree, start, end,
5154				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5155				 &cached_state);
5156
5157		cond_resched();
5158		spin_lock(&io_tree->lock);
5159	}
5160	spin_unlock(&io_tree->lock);
5161}
5162
5163static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5164							struct btrfs_block_rsv *rsv)
5165{
5166	struct btrfs_fs_info *fs_info = root->fs_info;
5167	struct btrfs_trans_handle *trans;
5168	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5169	int ret;
5170
5171	/*
5172	 * Eviction should be taking place from somewhere safe because of our
5173	 * delayed iputs.  However the normal flushing code will run delayed
5174	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5175	 *
5176	 * We reserve the delayed_refs_extra here again because we can't use
5177	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5178	 * above.  We reserve our extra bit here because we generate a ton of
5179	 * delayed refs activity by truncating.
5180	 *
5181	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can;
5182	 * if we fail to make this reservation we can re-try without the
5183	 * delayed_refs_extra so we can make some forward progress.
5184	 */
5185	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5186				     BTRFS_RESERVE_FLUSH_EVICT);
5187	if (ret) {
5188		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5189					     BTRFS_RESERVE_FLUSH_EVICT);
5190		if (ret) {
5191			btrfs_warn(fs_info,
5192				   "could not allocate space for delete; will truncate on mount");
5193			return ERR_PTR(-ENOSPC);
5194		}
5195		delayed_refs_extra = 0;
5196	}
5197
5198	trans = btrfs_join_transaction(root);
5199	if (IS_ERR(trans))
5200		return trans;
5201
5202	if (delayed_refs_extra) {
5203		trans->block_rsv = &fs_info->trans_block_rsv;
5204		trans->bytes_reserved = delayed_refs_extra;
5205		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5206					delayed_refs_extra, true);
5207	}
5208	return trans;
5209}
5210
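/*
 * Evict an inode from memory.  For an unlinked inode this truncates all of
 * its items in a loop, retrying on ENOSPC/EAGAIN with a temporary block
 * reservation, then removes the orphan item and finally drops the delayed
 * node before clearing the VFS inode.
 */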
5211void btrfs_evict_inode(struct inode *inode)
5212{
5213	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5214	struct btrfs_trans_handle *trans;
5215	struct btrfs_root *root = BTRFS_I(inode)->root;
5216	struct btrfs_block_rsv *rsv = NULL;
5217	int ret;
5218
5219	trace_btrfs_inode_evict(inode);
5220
5221	if (!root) {
5222		fsverity_cleanup_inode(inode);
5223		clear_inode(inode);
5224		return;
5225	}
5226
5227	evict_inode_truncate_pages(inode);
5228
5229	if (inode->i_nlink &&
5230	    ((btrfs_root_refs(&root->root_item) != 0 &&
5231	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5232	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5233		goto out;
5234
5235	if (is_bad_inode(inode))
5236		goto out;
5237
5238	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5239		goto out;
5240
5241	if (inode->i_nlink > 0) {
5242		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5243		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5244		goto out;
5245	}
5246
5247	/*
5248	 * This makes sure the inode item in the tree is up to date and the space for
5249	 * the inode update is released.
5250	 */
5251	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5252	if (ret)
5253		goto out;
5254
5255	/*
5256	 * This drops any pending insert or delete operations we have for this
5257	 * inode.  We could have a delayed dir index deletion queued up, but
5258	 * we're removing the inode completely so that'll be taken care of in
5259	 * the truncate.
5260	 */
5261	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5262
5263	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5264	if (!rsv)
5265		goto out;
5266	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5267	rsv->failfast = true;
5268
5269	btrfs_i_size_write(BTRFS_I(inode), 0);
5270
5271	while (1) {
5272		struct btrfs_truncate_control control = {
5273			.inode = BTRFS_I(inode),
5274			.ino = btrfs_ino(BTRFS_I(inode)),
5275			.new_size = 0,
5276			.min_type = 0,
5277		};
5278
5279		trans = evict_refill_and_join(root, rsv);
5280		if (IS_ERR(trans))
5281			goto out;
5282
5283		trans->block_rsv = rsv;
5284
5285		ret = btrfs_truncate_inode_items(trans, root, &control);
5286		trans->block_rsv = &fs_info->trans_block_rsv;
5287		btrfs_end_transaction(trans);
5288		/*
5289		 * We have not added new delayed items for our inode after we
5290		 * have flushed its delayed items, so no need to throttle on
5291		 * delayed items. However we have modified extent buffers.
5292		 */
5293		btrfs_btree_balance_dirty_nodelay(fs_info);
5294		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5295			goto out;
5296		else if (!ret)
5297			break;
5298	}
5299
5300	/*
5301	 * Errors here aren't a big deal; they just mean we leave orphan items in
5302	 * the tree. They will be cleaned up on the next mount. If the inode
5303	 * number gets reused, cleanup deletes the orphan item without doing
5304	 * anything, and unlink reuses the existing orphan item.
5305	 *
5306	 * If it turns out that we are dropping too many of these, we might want
5307	 * to add a mechanism for retrying these after a commit.
5308	 */
5309	trans = evict_refill_and_join(root, rsv);
5310	if (!IS_ERR(trans)) {
5311		trans->block_rsv = rsv;
5312		btrfs_orphan_del(trans, BTRFS_I(inode));
5313		trans->block_rsv = &fs_info->trans_block_rsv;
5314		btrfs_end_transaction(trans);
5315	}
5316
5317out:
5318	btrfs_free_block_rsv(fs_info, rsv);
5319	/*
5320	 * If we didn't successfully delete, the orphan item will still be in
5321	 * the tree and we'll retry on the next mount. Again, we might also want
5322	 * to retry these periodically in the future.
5323	 */
5324	btrfs_remove_delayed_node(BTRFS_I(inode));
5325	fsverity_cleanup_inode(inode);
5326	clear_inode(inode);
5327}
5328
5329/*
5330 * Return the key found in the dir entry in the location pointer, fill @type
5331 * with BTRFS_FT_*, and return 0.
5332 *
5333 * If no dir entries were found, returns -ENOENT.
5334 * If a corrupted location is found in the dir entry, returns -EUCLEAN.
5335 */
5336static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5337			       struct btrfs_key *location, u8 *type)
5338{
5339	struct btrfs_dir_item *di;
5340	struct btrfs_path *path;
5341	struct btrfs_root *root = dir->root;
5342	int ret = 0;
5343	struct fscrypt_name fname;
5344
5345	path = btrfs_alloc_path();
5346	if (!path)
5347		return -ENOMEM;
5348
5349	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5350	if (ret < 0)
5351		goto out;
5352	/*
5353	 * fscrypt_setup_filename() should never return a positive value, but
5354	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5355	 */
5356	ASSERT(ret == 0);
5357
5358	/* This needs to handle no-key deletions later on */
5359
5360	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5361				   &fname.disk_name, 0);
5362	if (IS_ERR_OR_NULL(di)) {
5363		ret = di ? PTR_ERR(di) : -ENOENT;
5364		goto out;
5365	}
5366
5367	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5368	if (location->type != BTRFS_INODE_ITEM_KEY &&
5369	    location->type != BTRFS_ROOT_ITEM_KEY) {
5370		ret = -EUCLEAN;
5371		btrfs_warn(root->fs_info,
5372"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5373			   __func__, fname.disk_name.name, btrfs_ino(dir),
5374			   location->objectid, location->type, location->offset);
5375	}
5376	if (!ret)
5377		*type = btrfs_dir_ftype(path->nodes[0], di);
5378out:
5379	fscrypt_free_filename(&fname);
5380	btrfs_free_path(path);
5381	return ret;
5382}
5383
5384/*
5385 * When we hit a tree root in a directory, the btrfs part of the inode
5386 * needs to be changed to reflect the root directory of the tree root.  This
5387 * is kind of like crossing a mount point.
5388 */
5389static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5390				    struct btrfs_inode *dir,
5391				    struct dentry *dentry,
5392				    struct btrfs_key *location,
5393				    struct btrfs_root **sub_root)
5394{
5395	struct btrfs_path *path;
5396	struct btrfs_root *new_root;
5397	struct btrfs_root_ref *ref;
5398	struct extent_buffer *leaf;
5399	struct btrfs_key key;
5400	int ret;
5401	int err = 0;
5402	struct fscrypt_name fname;
5403
5404	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5405	if (ret)
5406		return ret;
5407
5408	path = btrfs_alloc_path();
5409	if (!path) {
5410		err = -ENOMEM;
5411		goto out;
5412	}
5413
5414	err = -ENOENT;
5415	key.objectid = dir->root->root_key.objectid;
5416	key.type = BTRFS_ROOT_REF_KEY;
5417	key.offset = location->objectid;
5418
5419	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5420	if (ret) {
5421		if (ret < 0)
5422			err = ret;
5423		goto out;
5424	}
5425
5426	leaf = path->nodes[0];
5427	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5428	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5429	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5430		goto out;
5431
5432	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5433				   (unsigned long)(ref + 1), fname.disk_name.len);
5434	if (ret)
5435		goto out;
5436
5437	btrfs_release_path(path);
5438
5439	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5440	if (IS_ERR(new_root)) {
5441		err = PTR_ERR(new_root);
5442		goto out;
5443	}
5444
5445	*sub_root = new_root;
5446	location->objectid = btrfs_root_dirid(&new_root->root_item);
5447	location->type = BTRFS_INODE_ITEM_KEY;
5448	location->offset = 0;
5449	err = 0;
5450out:
5451	btrfs_free_path(path);
5452	fscrypt_free_filename(&fname);
5453	return err;
5454}
5455
5456static void inode_tree_add(struct btrfs_inode *inode)
5457{
5458	struct btrfs_root *root = inode->root;
5459	struct btrfs_inode *entry;
5460	struct rb_node **p;
5461	struct rb_node *parent;
5462	struct rb_node *new = &inode->rb_node;
5463	u64 ino = btrfs_ino(inode);
5464
5465	if (inode_unhashed(&inode->vfs_inode))
5466		return;
5467	parent = NULL;
5468	spin_lock(&root->inode_lock);
5469	p = &root->inode_tree.rb_node;
5470	while (*p) {
5471		parent = *p;
5472		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5473
5474		if (ino < btrfs_ino(entry))
5475			p = &parent->rb_left;
5476		else if (ino > btrfs_ino(entry))
5477			p = &parent->rb_right;
5478		else {
5479			WARN_ON(!(entry->vfs_inode.i_state &
5480				  (I_WILL_FREE | I_FREEING)));
5481			rb_replace_node(parent, new, &root->inode_tree);
5482			RB_CLEAR_NODE(parent);
5483			spin_unlock(&root->inode_lock);
5484			return;
5485		}
5486	}
5487	rb_link_node(new, parent, p);
5488	rb_insert_color(new, &root->inode_tree);
5489	spin_unlock(&root->inode_lock);
5490}
5491
5492static void inode_tree_del(struct btrfs_inode *inode)
5493{
5494	struct btrfs_root *root = inode->root;
5495	int empty = 0;
5496
5497	spin_lock(&root->inode_lock);
5498	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5499		rb_erase(&inode->rb_node, &root->inode_tree);
5500		RB_CLEAR_NODE(&inode->rb_node);
5501		empty = RB_EMPTY_ROOT(&root->inode_tree);
5502	}
5503	spin_unlock(&root->inode_lock);
5504
5505	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5506		spin_lock(&root->inode_lock);
5507		empty = RB_EMPTY_ROOT(&root->inode_tree);
5508		spin_unlock(&root->inode_lock);
5509		if (empty)
5510			btrfs_add_dead_root(root);
5511	}
5512}
5513
5514
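/*
 * Initialization callback for iget5_locked(): set the inode number, location
 * key and root on a freshly allocated inode, and flag free space cache
 * inodes (owned by the tree root but not the btree inode itself).
 */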
5515static int btrfs_init_locked_inode(struct inode *inode, void *p)
5516{
5517	struct btrfs_iget_args *args = p;
5518
5519	inode->i_ino = args->ino;
5520	BTRFS_I(inode)->location.objectid = args->ino;
5521	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5522	BTRFS_I(inode)->location.offset = 0;
5523	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5524	BUG_ON(args->root && !BTRFS_I(inode)->root);
5525
5526	if (args->root && args->root == args->root->fs_info->tree_root &&
5527	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
5528		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5529			&BTRFS_I(inode)->runtime_flags);
5530	return 0;
5531}
5532
5533static int btrfs_find_actor(struct inode *inode, void *opaque)
5534{
5535	struct btrfs_iget_args *args = opaque;
5536
5537	return args->ino == BTRFS_I(inode)->location.objectid &&
5538		args->root == BTRFS_I(inode)->root;
5539}
5540
5541static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5542				       struct btrfs_root *root)
5543{
5544	struct inode *inode;
5545	struct btrfs_iget_args args;
5546	unsigned long hashval = btrfs_inode_hash(ino, root);
5547
5548	args.ino = ino;
5549	args.root = root;
5550
5551	inode = iget5_locked(s, hashval, btrfs_find_actor,
5552			     btrfs_init_locked_inode,
5553			     (void *)&args);
5554	return inode;
5555}
5556
5557/*
5558 * Get an inode object given its inode number and corresponding root.
5559 * The path can be preallocated to prevent recursing back to iget through the
5560 * allocator. NULL is also valid but may require an additional allocation
5561 * later.
5562 */
5563struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5564			      struct btrfs_root *root, struct btrfs_path *path)
5565{
5566	struct inode *inode;
5567
5568	inode = btrfs_iget_locked(s, ino, root);
5569	if (!inode)
5570		return ERR_PTR(-ENOMEM);
5571
5572	if (inode->i_state & I_NEW) {
5573		int ret;
5574
5575		ret = btrfs_read_locked_inode(inode, path);
5576		if (!ret) {
5577			inode_tree_add(BTRFS_I(inode));
5578			unlock_new_inode(inode);
5579		} else {
5580			iget_failed(inode);
5581			/*
5582			 * ret > 0 can come from btrfs_search_slot called by
5583			 * btrfs_read_locked_inode, this means the inode item
5584			 * was not found.
5585			 */
5586			if (ret > 0)
5587				ret = -ENOENT;
5588			inode = ERR_PTR(ret);
5589		}
5590	}
5591
5592	return inode;
5593}
5594
5595struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5596{
5597	return btrfs_iget_path(s, ino, root, NULL);
5598}
5599
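/*
 * Build an in-memory placeholder directory inode for a dir entry whose
 * target root could not be resolved (e.g. a deleted subvolume).  It only
 * supports lookup and is never written back to disk.
 */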
5600static struct inode *new_simple_dir(struct inode *dir,
5601				    struct btrfs_key *key,
5602				    struct btrfs_root *root)
5603{
5604	struct inode *inode = new_inode(dir->i_sb);
5605
5606	if (!inode)
5607		return ERR_PTR(-ENOMEM);
5608
5609	BTRFS_I(inode)->root = btrfs_grab_root(root);
5610	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5611	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5612
5613	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5614	/*
5615	 * We only need lookup; the rest is read-only and there's no inode
5616	 * associated with the dentry.
5617	 */
5618	inode->i_op = &simple_dir_inode_operations;
5619	inode->i_opflags &= ~IOP_XATTR;
5620	inode->i_fop = &simple_dir_operations;
5621	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5622	inode->i_mtime = inode_set_ctime_current(inode);
5623	inode->i_atime = dir->i_atime;
5624	BTRFS_I(inode)->i_otime = inode->i_mtime;
5625	inode->i_uid = dir->i_uid;
5626	inode->i_gid = dir->i_gid;
5627
5628	return inode;
5629}
5630
5631static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5632static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5633static_assert(BTRFS_FT_DIR == FT_DIR);
5634static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5635static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5636static_assert(BTRFS_FT_FIFO == FT_FIFO);
5637static_assert(BTRFS_FT_SOCK == FT_SOCK);
5638static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5639
5640static inline u8 btrfs_inode_type(struct inode *inode)
5641{
5642	return fs_umode_to_ftype(inode->i_mode);
5643}
5644
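/*
 * Look up the inode referenced by a directory entry.  Regular entries are
 * resolved in the current root; entries pointing at another tree root are
 * fixed up to that subvolume's root directory, falling back to a simple
 * placeholder directory when the root reference no longer exists.
 */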
5645struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5646{
5647	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5648	struct inode *inode;
5649	struct btrfs_root *root = BTRFS_I(dir)->root;
5650	struct btrfs_root *sub_root = root;
5651	struct btrfs_key location;
5652	u8 di_type = 0;
5653	int ret = 0;
5654
5655	if (dentry->d_name.len > BTRFS_NAME_LEN)
5656		return ERR_PTR(-ENAMETOOLONG);
5657
5658	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5659	if (ret < 0)
5660		return ERR_PTR(ret);
5661
5662	if (location.type == BTRFS_INODE_ITEM_KEY) {
5663		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5664		if (IS_ERR(inode))
5665			return inode;
5666
5667		/* Do extra check against inode mode with di_type */
5668		if (btrfs_inode_type(inode) != di_type) {
5669			btrfs_crit(fs_info,
5670"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5671				  inode->i_mode, btrfs_inode_type(inode),
5672				  di_type);
5673			iput(inode);
5674			return ERR_PTR(-EUCLEAN);
5675		}
5676		return inode;
5677	}
5678
5679	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5680				       &location, &sub_root);
5681	if (ret < 0) {
5682		if (ret != -ENOENT)
5683			inode = ERR_PTR(ret);
5684		else
5685			inode = new_simple_dir(dir, &location, root);
5686	} else {
5687		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5688		btrfs_put_root(sub_root);
5689
5690		if (IS_ERR(inode))
5691			return inode;
5692
5693		down_read(&fs_info->cleanup_work_sem);
5694		if (!sb_rdonly(inode->i_sb))
5695			ret = btrfs_orphan_cleanup(sub_root);
5696		up_read(&fs_info->cleanup_work_sem);
5697		if (ret) {
5698			iput(inode);
5699			inode = ERR_PTR(ret);
5700		}
5701	}
5702
5703	return inode;
5704}
5705
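/*
 * Tell the VFS to delete dentries belonging to a root that is being dropped
 * (zero root refs) or to the dummy empty-subvolume directory, instead of
 * keeping them cached.
 */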
5706static int btrfs_dentry_delete(const struct dentry *dentry)
5707{
5708	struct btrfs_root *root;
5709	struct inode *inode = d_inode(dentry);
5710
5711	if (!inode && !IS_ROOT(dentry))
5712		inode = d_inode(dentry->d_parent);
5713
5714	if (inode) {
5715		root = BTRFS_I(inode)->root;
5716		if (btrfs_root_refs(&root->root_item) == 0)
5717			return 1;
5718
5719		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5720			return 1;
5721	}
5722	return 0;
5723}
5724
5725static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5726				   unsigned int flags)
5727{
5728	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5729
5730	if (inode == ERR_PTR(-ENOENT))
5731		inode = NULL;
5732	return d_splice_alias(inode, dentry);
5733}
5734
5735/*
5736 * Find the highest existing sequence number in a directory and then set the
5737 * in-memory index_cnt variable to the first free sequence number.
5738 */
5739static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5740{
5741	struct btrfs_root *root = inode->root;
5742	struct btrfs_key key, found_key;
5743	struct btrfs_path *path;
5744	struct extent_buffer *leaf;
5745	int ret;
5746
5747	key.objectid = btrfs_ino(inode);
5748	key.type = BTRFS_DIR_INDEX_KEY;
5749	key.offset = (u64)-1;
5750
5751	path = btrfs_alloc_path();
5752	if (!path)
5753		return -ENOMEM;
5754
5755	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5756	if (ret < 0)
5757		goto out;
5758	/* FIXME: we should be able to handle this */
5759	if (ret == 0)
5760		goto out;
5761	ret = 0;
5762
5763	if (path->slots[0] == 0) {
5764		inode->index_cnt = BTRFS_DIR_START_INDEX;
5765		goto out;
5766	}
5767
5768	path->slots[0]--;
5769
5770	leaf = path->nodes[0];
5771	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5772
5773	if (found_key.objectid != btrfs_ino(inode) ||
5774	    found_key.type != BTRFS_DIR_INDEX_KEY) {
5775		inode->index_cnt = BTRFS_DIR_START_INDEX;
5776		goto out;
5777	}
5778
5779	inode->index_cnt = found_key.offset + 1;
5780out:
5781	btrfs_free_path(path);
5782	return ret;
5783}
5784
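/*
 * Return the index of the last directory entry.  index_cnt is computed
 * lazily, either from the delayed items or, failing that, from the highest
 * DIR_INDEX key in the tree.
 */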
5785static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5786{
5787	int ret = 0;
5788
5789	btrfs_inode_lock(dir, 0);
5790	if (dir->index_cnt == (u64)-1) {
5791		ret = btrfs_inode_delayed_dir_index_count(dir);
5792		if (ret) {
5793			ret = btrfs_set_inode_index_count(dir);
5794			if (ret)
5795				goto out;
5796		}
5797	}
5798
5799	/* index_cnt is the index number of the next new entry, so decrement it. */
5800	*index = dir->index_cnt - 1;
5801out:
5802	btrfs_inode_unlock(dir, 0);
5803
5804	return ret;
5805}
5806
5807/*
5808 * All this infrastructure exists because dir_emit can fault, and we are holding
5809 * the tree lock when doing readdir.  For now just allocate a buffer and copy
5810 * our information into that, and then dir_emit from the buffer.  This is
5811 * similar to what NFS does, only we don't keep the buffer around in pagecache
5812 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5813 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5814 * tree lock.
5815 */
5816static int btrfs_opendir(struct inode *inode, struct file *file)
5817{
5818	struct btrfs_file_private *private;
5819	u64 last_index;
5820	int ret;
5821
5822	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5823	if (ret)
5824		return ret;
5825
5826	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5827	if (!private)
5828		return -ENOMEM;
5829	private->last_index = last_index;
5830	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5831	if (!private->filldir_buf) {
5832		kfree(private);
5833		return -ENOMEM;
5834	}
5835	file->private_data = private;
5836	return 0;
5837}
5838
5839static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5840{
5841	struct btrfs_file_private *private = file->private_data;
5842	int ret;
5843
5844	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5845				       &private->last_index);
5846	if (ret)
5847		return ret;
5848
5849	return generic_file_llseek(file, offset, whence);
5850}
5851
5852struct dir_entry {
5853	u64 ino;
5854	u64 offset;
5855	unsigned type;
5856	int name_len;
5857};
5858
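/*
 * Flush the buffered directory entries to the caller with dir_emit().
 * Returns 1 if the caller's buffer filled up, 0 when all entries were
 * consumed.
 */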
5859static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5860{
5861	while (entries--) {
5862		struct dir_entry *entry = addr;
5863		char *name = (char *)(entry + 1);
5864
5865		ctx->pos = get_unaligned(&entry->offset);
5866		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5867					 get_unaligned(&entry->ino),
5868					 get_unaligned(&entry->type)))
5869			return 1;
5870		addr += sizeof(struct dir_entry) +
5871			get_unaligned(&entry->name_len);
5872		ctx->pos++;
5873	}
5874	return 0;
5875}
5876
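/*
 * Our readdir implementation: walk the DIR_INDEX items merged with the
 * delayed insertions/deletions, copy the entries into the preallocated
 * filldir buffer and emit them only after the search path was released.
 */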
5877static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5878{
5879	struct inode *inode = file_inode(file);
5880	struct btrfs_root *root = BTRFS_I(inode)->root;
5881	struct btrfs_file_private *private = file->private_data;
5882	struct btrfs_dir_item *di;
5883	struct btrfs_key key;
5884	struct btrfs_key found_key;
5885	struct btrfs_path *path;
5886	void *addr;
5887	LIST_HEAD(ins_list);
5888	LIST_HEAD(del_list);
5889	int ret;
5890	char *name_ptr;
5891	int name_len;
5892	int entries = 0;
5893	int total_len = 0;
5894	bool put = false;
5895	struct btrfs_key location;
5896
5897	if (!dir_emit_dots(file, ctx))
5898		return 0;
5899
5900	path = btrfs_alloc_path();
5901	if (!path)
5902		return -ENOMEM;
5903
5904	addr = private->filldir_buf;
5905	path->reada = READA_FORWARD;
5906
5907	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
5908					      &ins_list, &del_list);
5909
5910again:
5911	key.type = BTRFS_DIR_INDEX_KEY;
5912	key.offset = ctx->pos;
5913	key.objectid = btrfs_ino(BTRFS_I(inode));
5914
5915	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5916		struct dir_entry *entry;
5917		struct extent_buffer *leaf = path->nodes[0];
5918		u8 ftype;
5919
5920		if (found_key.objectid != key.objectid)
5921			break;
5922		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5923			break;
5924		if (found_key.offset < ctx->pos)
5925			continue;
5926		if (found_key.offset > private->last_index)
5927			break;
5928		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5929			continue;
5930		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5931		name_len = btrfs_dir_name_len(leaf, di);
5932		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5933		    PAGE_SIZE) {
5934			btrfs_release_path(path);
5935			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5936			if (ret)
5937				goto nopos;
5938			addr = private->filldir_buf;
5939			entries = 0;
5940			total_len = 0;
5941			goto again;
5942		}
5943
5944		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5945		entry = addr;
5946		name_ptr = (char *)(entry + 1);
5947		read_extent_buffer(leaf, name_ptr,
5948				   (unsigned long)(di + 1), name_len);
5949		put_unaligned(name_len, &entry->name_len);
5950		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5951		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5952		put_unaligned(location.objectid, &entry->ino);
5953		put_unaligned(found_key.offset, &entry->offset);
5954		entries++;
5955		addr += sizeof(struct dir_entry) + name_len;
5956		total_len += sizeof(struct dir_entry) + name_len;
5957	}
5958	/* Catch error encountered during iteration */
5959	if (ret < 0)
5960		goto err;
5961
5962	btrfs_release_path(path);
5963
5964	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5965	if (ret)
5966		goto nopos;
5967
5968	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5969	if (ret)
5970		goto nopos;
5971
5972	/*
5973	 * Stop new entries from being returned after we return the last
5974	 * entry.
5975	 *
5976	 * New directory entries are assigned a strictly increasing
5977	 * offset.  This means that new entries created during readdir
5978	 * are *guaranteed* to be seen in the future by that readdir.
5979	 * This has broken buggy programs which operate on names as
5980	 * they're returned by readdir.  Until we re-use freed offsets
5981	 * we have this hack to stop new entries from being returned
5982	 * under the assumption that they'll never reach this huge
5983	 * offset.
5984	 *
5985	 * This is being careful not to overflow 32bit loff_t unless the
5986	 * last entry requires it because doing so has broken 32bit apps
5987	 * in the past.
5988	 */
5989	if (ctx->pos >= INT_MAX)
5990		ctx->pos = LLONG_MAX;
5991	else
5992		ctx->pos = INT_MAX;
5993nopos:
5994	ret = 0;
5995err:
5996	if (put)
5997		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
5998	btrfs_free_path(path);
5999	return ret;
6000}
6001
6002/*
6003 * This is somewhat expensive, updating the tree every time the
6004 * inode changes.  But, it is most likely to find the inode in cache.
6005 * FIXME, needs more benchmarking...there are no reasons other than performance
6006 * to keep or drop this code.
6007 */
6008static int btrfs_dirty_inode(struct btrfs_inode *inode)
6009{
6010	struct btrfs_root *root = inode->root;
6011	struct btrfs_fs_info *fs_info = root->fs_info;
6012	struct btrfs_trans_handle *trans;
6013	int ret;
6014
6015	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6016		return 0;
6017
6018	trans = btrfs_join_transaction(root);
6019	if (IS_ERR(trans))
6020		return PTR_ERR(trans);
6021
6022	ret = btrfs_update_inode(trans, root, inode);
6023	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6024		/* whoops, lets try again with the full transaction */
6025		btrfs_end_transaction(trans);
6026		trans = btrfs_start_transaction(root, 1);
6027		if (IS_ERR(trans))
6028			return PTR_ERR(trans);
6029
6030		ret = btrfs_update_inode(trans, root, inode);
6031	}
6032	btrfs_end_transaction(trans);
6033	if (inode->delayed_node)
6034		btrfs_balance_delayed_items(fs_info);
6035
6036	return ret;
6037}
6038
6039/*
6040 * This is a copy of file_update_time.  We need this so we can return error on
6041 * ENOSPC for updating the inode in the case of file write and mmap writes.
6042 */
6043static int btrfs_update_time(struct inode *inode, int flags)
6044{
6045	struct btrfs_root *root = BTRFS_I(inode)->root;
6046	bool dirty = flags & ~S_VERSION;
6047
6048	if (btrfs_root_readonly(root))
6049		return -EROFS;
6050
6051	dirty = inode_update_timestamps(inode, flags);
6052	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6053}
6054
6055/*
6056 * Helper to find a free sequence number in a given directory.  The current
6057 * code is very simple; later versions will do smarter things in the btree.
6058 */
6059int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6060{
6061	int ret = 0;
6062
6063	if (dir->index_cnt == (u64)-1) {
6064		ret = btrfs_inode_delayed_dir_index_count(dir);
6065		if (ret) {
6066			ret = btrfs_set_inode_index_count(dir);
6067			if (ret)
6068				return ret;
6069		}
6070	}
6071
6072	*index = dir->index_cnt;
6073	dir->index_cnt++;
6074
6075	return ret;
6076}
6077
6078static int btrfs_insert_inode_locked(struct inode *inode)
6079{
6080	struct btrfs_iget_args args;
6081
6082	args.ino = BTRFS_I(inode)->location.objectid;
6083	args.root = BTRFS_I(inode)->root;
6084
6085	return insert_inode_locked4(inode,
6086		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6087		   btrfs_find_actor, &args);
6088}
6089
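/*
 * Prepare the filename and ACLs for a new inode and compute how many tree
 * items the transaction must reserve: the inode item plus optional
 * compression property, ACL and LSM xattrs, and either the orphan item or
 * the dir item, dir index and parent inode update.
 */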
6090int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6091			    unsigned int *trans_num_items)
6092{
6093	struct inode *dir = args->dir;
6094	struct inode *inode = args->inode;
6095	int ret;
6096
6097	if (!args->orphan) {
6098		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6099					     &args->fname);
6100		if (ret)
6101			return ret;
6102	}
6103
6104	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6105	if (ret) {
6106		fscrypt_free_filename(&args->fname);
6107		return ret;
6108	}
6109
6110	/* 1 to add inode item */
6111	*trans_num_items = 1;
6112	/* 1 to add compression property */
6113	if (BTRFS_I(dir)->prop_compress)
6114		(*trans_num_items)++;
6115	/* 1 to add default ACL xattr */
6116	if (args->default_acl)
6117		(*trans_num_items)++;
6118	/* 1 to add access ACL xattr */
6119	if (args->acl)
6120		(*trans_num_items)++;
6121#ifdef CONFIG_SECURITY
6122	/* 1 to add LSM xattr */
6123	if (dir->i_security)
6124		(*trans_num_items)++;
6125#endif
6126	if (args->orphan) {
6127		/* 1 to add orphan item */
6128		(*trans_num_items)++;
6129	} else {
6130		/*
6131		 * 1 to add dir item
6132		 * 1 to add dir index
6133		 * 1 to update parent inode item
6134		 *
6135		 * No need for 1 unit for the inode ref item because it is
6136		 * inserted in a batch together with the inode item at
6137		 * btrfs_create_new_inode().
6138		 */
6139		*trans_num_items += 3;
6140	}
6141	return 0;
6142}
6143
6144void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6145{
6146	posix_acl_release(args->acl);
6147	posix_acl_release(args->default_acl);
6148	fscrypt_free_filename(&args->fname);
6149}
6150
6151/*
6152 * Inherit flags from the parent inode.
6153 *
6154 * Currently only the compression flags and the cow flags are inherited.
6155 */
6156static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6157{
6158	unsigned int flags;
6159
6160	flags = dir->flags;
6161
6162	if (flags & BTRFS_INODE_NOCOMPRESS) {
6163		inode->flags &= ~BTRFS_INODE_COMPRESS;
6164		inode->flags |= BTRFS_INODE_NOCOMPRESS;
6165	} else if (flags & BTRFS_INODE_COMPRESS) {
6166		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6167		inode->flags |= BTRFS_INODE_COMPRESS;
6168	}
6169
6170	if (flags & BTRFS_INODE_NODATACOW) {
6171		inode->flags |= BTRFS_INODE_NODATACOW;
6172		if (S_ISREG(inode->vfs_inode.i_mode))
6173			inode->flags |= BTRFS_INODE_NODATASUM;
6174	}
6175
6176	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6177}
6178
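/*
 * Create a new inode in the tree: allocate an objectid, insert the inode
 * item and inode ref in one batch, inherit flags and properties, set up
 * security xattrs and ACLs, and finally add either the directory link or,
 * for O_TMPFILE, an orphan item.
 */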
6179int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6180			   struct btrfs_new_inode_args *args)
6181{
6182	struct inode *dir = args->dir;
6183	struct inode *inode = args->inode;
6184	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6185	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6186	struct btrfs_root *root;
6187	struct btrfs_inode_item *inode_item;
6188	struct btrfs_key *location;
6189	struct btrfs_path *path;
6190	u64 objectid;
6191	struct btrfs_inode_ref *ref;
6192	struct btrfs_key key[2];
6193	u32 sizes[2];
6194	struct btrfs_item_batch batch;
6195	unsigned long ptr;
6196	int ret;
6197
6198	path = btrfs_alloc_path();
6199	if (!path)
6200		return -ENOMEM;
6201
6202	if (!args->subvol)
6203		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6204	root = BTRFS_I(inode)->root;
6205
6206	ret = btrfs_get_free_objectid(root, &objectid);
6207	if (ret)
6208		goto out;
6209	inode->i_ino = objectid;
6210
6211	if (args->orphan) {
6212		/*
6213		 * O_TMPFILE: set the link count to 0, so that when we fill in the
6214		 * inode item below it gets the correct link count.
6215		 */
6216		set_nlink(inode, 0);
6217	} else {
6218		trace_btrfs_inode_request(dir);
6219
6220		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6221		if (ret)
6222			goto out;
6223	}
6224	/* index_cnt is ignored for everything but a dir. */
6225	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6226	BTRFS_I(inode)->generation = trans->transid;
6227	inode->i_generation = BTRFS_I(inode)->generation;
6228
6229	/*
6230	 * Subvolumes don't inherit flags from their parent directory.
6231	 * Originally this was probably by accident, but we probably can't
6232	 * change it now without compatibility issues.
6233	 */
6234	if (!args->subvol)
6235		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6236
6237	if (S_ISREG(inode->i_mode)) {
6238		if (btrfs_test_opt(fs_info, NODATASUM))
6239			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6240		if (btrfs_test_opt(fs_info, NODATACOW))
6241			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6242				BTRFS_INODE_NODATASUM;
6243	}
6244
6245	location = &BTRFS_I(inode)->location;
6246	location->objectid = objectid;
6247	location->offset = 0;
6248	location->type = BTRFS_INODE_ITEM_KEY;
6249
6250	ret = btrfs_insert_inode_locked(inode);
6251	if (ret < 0) {
6252		if (!args->orphan)
6253			BTRFS_I(dir)->index_cnt--;
6254		goto out;
6255	}
6256
6257	/*
6258	 * We could have gotten an inode number from somebody who was fsynced
6259	 * and then removed in this same transaction, so let's just set full
6260	 * sync since it will be a full sync anyway and this will blow away the
6261	 * old info in the log.
6262	 */
6263	btrfs_set_inode_full_sync(BTRFS_I(inode));
6264
6265	key[0].objectid = objectid;
6266	key[0].type = BTRFS_INODE_ITEM_KEY;
6267	key[0].offset = 0;
6268
6269	sizes[0] = sizeof(struct btrfs_inode_item);
6270
6271	if (!args->orphan) {
6272		/*
6273		 * Start new inodes with an inode_ref. This is slightly more
6274		 * efficient for small numbers of hard links since they will
6275		 * be packed into one item. Extended refs will kick in if we
6276		 * add more hard links than can fit in the ref item.
6277		 */
6278		key[1].objectid = objectid;
6279		key[1].type = BTRFS_INODE_REF_KEY;
6280		if (args->subvol) {
6281			key[1].offset = objectid;
6282			sizes[1] = 2 + sizeof(*ref);
6283		} else {
6284			key[1].offset = btrfs_ino(BTRFS_I(dir));
6285			sizes[1] = name->len + sizeof(*ref);
6286		}
6287	}
6288
6289	batch.keys = &key[0];
6290	batch.data_sizes = &sizes[0];
6291	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6292	batch.nr = args->orphan ? 1 : 2;
6293	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6294	if (ret != 0) {
6295		btrfs_abort_transaction(trans, ret);
6296		goto discard;
6297	}
6298
6299	inode->i_mtime = inode_set_ctime_current(inode);
6300	inode->i_atime = inode->i_mtime;
6301	BTRFS_I(inode)->i_otime = inode->i_mtime;
6302
6303	/*
6304	 * We're going to fill the inode item now, so at this point the inode
6305	 * must be fully initialized.
6306	 */
6307
6308	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6309				  struct btrfs_inode_item);
6310	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6311			     sizeof(*inode_item));
6312	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6313
6314	if (!args->orphan) {
6315		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6316				     struct btrfs_inode_ref);
6317		ptr = (unsigned long)(ref + 1);
6318		if (args->subvol) {
6319			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6320			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6321			write_extent_buffer(path->nodes[0], "..", ptr, 2);
6322		} else {
6323			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6324						     name->len);
6325			btrfs_set_inode_ref_index(path->nodes[0], ref,
6326						  BTRFS_I(inode)->dir_index);
6327			write_extent_buffer(path->nodes[0], name->name, ptr,
6328					    name->len);
6329		}
6330	}
6331
6332	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
6333	/*
6334	 * We don't need the path anymore; inheriting properties, adding ACLs,
6335	 * security xattrs, the orphan item or adding the link will result in
6336	 * allocating yet another path. So just free our path.
6337	 */
6338	btrfs_free_path(path);
6339	path = NULL;
6340
6341	if (args->subvol) {
6342		struct inode *parent;
6343
6344		/*
6345		 * Subvolumes inherit properties from their parent subvolume,
6346		 * not the directory they were created in.
6347		 */
6348		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
6349				    BTRFS_I(dir)->root);
6350		if (IS_ERR(parent)) {
6351			ret = PTR_ERR(parent);
6352		} else {
6353			ret = btrfs_inode_inherit_props(trans, inode, parent);
6354			iput(parent);
6355		}
6356	} else {
6357		ret = btrfs_inode_inherit_props(trans, inode, dir);
6358	}
6359	if (ret) {
6360		btrfs_err(fs_info,
6361			  "error inheriting props for ino %llu (root %llu): %d",
6362			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
6363			  ret);
6364	}
6365
6366	/*
6367	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6368	 * probably a bug.
6369	 */
6370	if (!args->subvol) {
6371		ret = btrfs_init_inode_security(trans, args);
6372		if (ret) {
6373			btrfs_abort_transaction(trans, ret);
6374			goto discard;
6375		}
6376	}
6377
6378	inode_tree_add(BTRFS_I(inode));
6379
6380	trace_btrfs_inode_new(inode);
6381	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6382
6383	btrfs_update_root_times(trans, root);
6384
6385	if (args->orphan) {
6386		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6387	} else {
6388		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6389				     0, BTRFS_I(inode)->dir_index);
6390	}
6391	if (ret) {
6392		btrfs_abort_transaction(trans, ret);
6393		goto discard;
6394	}
6395
6396	return 0;
6397
6398discard:
6399	/*
6400	 * discard_new_inode() calls iput(), but the caller owns the reference
6401	 * to the inode.
6402	 */
6403	ihold(inode);
6404	discard_new_inode(inode);
6405out:
6406	btrfs_free_path(path);
6407	return ret;
6408}
6409
6410/*
6411 * Utility function to add 'inode' into 'parent_inode' with
6412 * a given name and a given sequence number.
6413 * If 'add_backref' is true, also insert a backref from the
6414 * inode to the parent directory.
6415 */
6416int btrfs_add_link(struct btrfs_trans_handle *trans,
6417		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6418		   const struct fscrypt_str *name, int add_backref, u64 index)
6419{
6420	int ret = 0;
6421	struct btrfs_key key;
6422	struct btrfs_root *root = parent_inode->root;
6423	u64 ino = btrfs_ino(inode);
6424	u64 parent_ino = btrfs_ino(parent_inode);
6425
6426	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6427		memcpy(&key, &inode->root->root_key, sizeof(key));
6428	} else {
6429		key.objectid = ino;
6430		key.type = BTRFS_INODE_ITEM_KEY;
6431		key.offset = 0;
6432	}
6433
6434	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6435		ret = btrfs_add_root_ref(trans, key.objectid,
6436					 root->root_key.objectid, parent_ino,
6437					 index, name);
6438	} else if (add_backref) {
6439		ret = btrfs_insert_inode_ref(trans, root, name,
6440					     ino, parent_ino, index);
6441	}
6442
6443	/* Nothing to clean up yet */
6444	if (ret)
6445		return ret;
6446
6447	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6448				    btrfs_inode_type(&inode->vfs_inode), index);
6449	if (ret == -EEXIST || ret == -EOVERFLOW)
6450		goto fail_dir_item;
6451	else if (ret) {
6452		btrfs_abort_transaction(trans, ret);
6453		return ret;
6454	}
6455
6456	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6457			   name->len * 2);
6458	inode_inc_iversion(&parent_inode->vfs_inode);
6459	/*
6460	 * If we are replaying a log tree, we do not want to update the mtime
6461	 * and ctime of the parent directory with the current time, since the
6462	 * log replay procedure is responsible for setting them to their correct
6463	 * values (the ones it had when the fsync was done).
6464	 */
6465	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6466		parent_inode->vfs_inode.i_mtime =
6467			inode_set_ctime_current(&parent_inode->vfs_inode);
6468
6469	ret = btrfs_update_inode(trans, root, parent_inode);
6470	if (ret)
6471		btrfs_abort_transaction(trans, ret);
6472	return ret;
6473
6474fail_dir_item:
6475	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6476		u64 local_index;
6477		int err;
6478		err = btrfs_del_root_ref(trans, key.objectid,
6479					 root->root_key.objectid, parent_ino,
6480					 &local_index, name);
6481		if (err)
6482			btrfs_abort_transaction(trans, err);
6483	} else if (add_backref) {
6484		u64 local_index;
6485		int err;
6486
6487		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6488					  &local_index);
6489		if (err)
6490			btrfs_abort_transaction(trans, err);
6491	}
6492
6493	/* Return the original error code */
6494	return ret;
6495}
6496
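/*
 * Common helper for create, mknod and mkdir: prepare the new inode args,
 * start a transaction with the required number of items, create the inode
 * and instantiate the dentry.
 */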
6497static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6498			       struct inode *inode)
6499{
6500	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6501	struct btrfs_root *root = BTRFS_I(dir)->root;
6502	struct btrfs_new_inode_args new_inode_args = {
6503		.dir = dir,
6504		.dentry = dentry,
6505		.inode = inode,
6506	};
6507	unsigned int trans_num_items;
6508	struct btrfs_trans_handle *trans;
6509	int err;
6510
6511	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6512	if (err)
6513		goto out_inode;
6514
6515	trans = btrfs_start_transaction(root, trans_num_items);
6516	if (IS_ERR(trans)) {
6517		err = PTR_ERR(trans);
6518		goto out_new_inode_args;
6519	}
6520
6521	err = btrfs_create_new_inode(trans, &new_inode_args);
6522	if (!err)
6523		d_instantiate_new(dentry, inode);
6524
6525	btrfs_end_transaction(trans);
6526	btrfs_btree_balance_dirty(fs_info);
6527out_new_inode_args:
6528	btrfs_new_inode_args_destroy(&new_inode_args);
6529out_inode:
6530	if (err)
6531		iput(inode);
6532	return err;
6533}
6534
6535static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6536		       struct dentry *dentry, umode_t mode, dev_t rdev)
6537{
6538	struct inode *inode;
6539
6540	inode = new_inode(dir->i_sb);
6541	if (!inode)
6542		return -ENOMEM;
6543	inode_init_owner(idmap, inode, dir, mode);
6544	inode->i_op = &btrfs_special_inode_operations;
6545	init_special_inode(inode, inode->i_mode, rdev);
6546	return btrfs_create_common(dir, dentry, inode);
6547}
6548
6549static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6550			struct dentry *dentry, umode_t mode, bool excl)
6551{
6552	struct inode *inode;
6553
6554	inode = new_inode(dir->i_sb);
6555	if (!inode)
6556		return -ENOMEM;
6557	inode_init_owner(idmap, inode, dir, mode);
6558	inode->i_fop = &btrfs_file_operations;
6559	inode->i_op = &btrfs_file_inode_operations;
6560	inode->i_mapping->a_ops = &btrfs_aops;
6561	return btrfs_create_common(dir, dentry, inode);
6562}
6563
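/*
 * Create a hard link: bump the link count, add the dir item, dir index and
 * inode ref, and remove the orphan item when an O_TMPFILE inode is linked
 * into the namespace for the first time.
 */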
6564static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6565		      struct dentry *dentry)
6566{
6567	struct btrfs_trans_handle *trans = NULL;
6568	struct btrfs_root *root = BTRFS_I(dir)->root;
6569	struct inode *inode = d_inode(old_dentry);
6570	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6571	struct fscrypt_name fname;
6572	u64 index;
6573	int err;
6574	int drop_inode = 0;
6575
6576	/* Do not allow sys_link() across subvolumes of the same device. */
6577	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6578		return -EXDEV;
6579
6580	if (inode->i_nlink >= BTRFS_LINK_MAX)
6581		return -EMLINK;
6582
6583	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6584	if (err)
6585		goto fail;
6586
6587	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6588	if (err)
6589		goto fail;
6590
6591	/*
6592	 * 2 items for inode and inode ref
6593	 * 2 items for dir items
6594	 * 1 item for parent inode
6595	 * 1 item for orphan item deletion if O_TMPFILE
6596	 */
6597	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6598	if (IS_ERR(trans)) {
6599		err = PTR_ERR(trans);
6600		trans = NULL;
6601		goto fail;
6602	}
6603
6604	/* There are several dir indexes for this inode, clear the cache. */
6605	BTRFS_I(inode)->dir_index = 0ULL;
6606	inc_nlink(inode);
6607	inode_inc_iversion(inode);
6608	inode_set_ctime_current(inode);
6609	ihold(inode);
6610	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6611
6612	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6613			     &fname.disk_name, 1, index);
6614
6615	if (err) {
6616		drop_inode = 1;
6617	} else {
6618		struct dentry *parent = dentry->d_parent;
6619
6620		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6621		if (err)
6622			goto fail;
6623		if (inode->i_nlink == 1) {
6624			/*
6625			 * If the new hard link count is 1, it's a file created
6626			 * with the open(2) O_TMPFILE flag.
6627			 */
6628			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6629			if (err)
6630				goto fail;
6631		}
6632		d_instantiate(dentry, inode);
6633		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6634	}
6635
6636fail:
6637	fscrypt_free_filename(&fname);
6638	if (trans)
6639		btrfs_end_transaction(trans);
6640	if (drop_inode) {
6641		inode_dec_link_count(inode);
6642		iput(inode);
6643	}
6644	btrfs_btree_balance_dirty(fs_info);
6645	return err;
6646}
6647
6648static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6649		       struct dentry *dentry, umode_t mode)
6650{
6651	struct inode *inode;
6652
6653	inode = new_inode(dir->i_sb);
6654	if (!inode)
6655		return -ENOMEM;
6656	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6657	inode->i_op = &btrfs_dir_inode_operations;
6658	inode->i_fop = &btrfs_dir_file_operations;
6659	return btrfs_create_common(dir, dentry, inode);
6660}
6661
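/*
 * Decompress the data of a compressed inline extent into @page, zeroing the
 * part of the page past the uncompressed size.
 */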
6662static noinline int uncompress_inline(struct btrfs_path *path,
6663				      struct page *page,
6664				      struct btrfs_file_extent_item *item)
6665{
6666	int ret;
6667	struct extent_buffer *leaf = path->nodes[0];
6668	char *tmp;
6669	size_t max_size;
6670	unsigned long inline_size;
6671	unsigned long ptr;
6672	int compress_type;
6673
6674	compress_type = btrfs_file_extent_compression(leaf, item);
6675	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6676	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6677	tmp = kmalloc(inline_size, GFP_NOFS);
6678	if (!tmp)
6679		return -ENOMEM;
6680	ptr = btrfs_file_extent_inline_start(item);
6681
6682	read_extent_buffer(leaf, tmp, ptr, inline_size);
6683
6684	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6685	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
6686
6687	/*
6688	 * The decompression code contains a memset to fill in any space between the end
6689	 * of the uncompressed data and the end of max_size in case the decompressed
6690	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6691	 * the end of an inline extent and the beginning of the next block, so we
6692	 * cover that region here.
6693	 */
6694
6695	if (max_size < PAGE_SIZE)
6696		memzero_page(page, max_size, PAGE_SIZE - max_size);
6697	kfree(tmp);
6698	return ret;
6699}
6700
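/*
 * Read an inline extent into the first page of the file, either by copying
 * it directly or by decompressing it, and zero the rest of the page.
 */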
6701static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6702			      struct page *page)
6703{
6704	struct btrfs_file_extent_item *fi;
6705	void *kaddr;
6706	size_t copy_size;
6707
6708	if (!page || PageUptodate(page))
6709		return 0;
6710
6711	ASSERT(page_offset(page) == 0);
6712
6713	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6714			    struct btrfs_file_extent_item);
6715	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6716		return uncompress_inline(path, page, fi);
6717
6718	copy_size = min_t(u64, PAGE_SIZE,
6719			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6720	kaddr = kmap_local_page(page);
6721	read_extent_buffer(path->nodes[0], kaddr,
6722			   btrfs_file_extent_inline_start(fi), copy_size);
6723	kunmap_local(kaddr);
6724	if (copy_size < PAGE_SIZE)
6725		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
6726	return 0;
6727}
6728
6729/*
6730 * Lookup the first extent overlapping a range in a file.
6731 *
6732 * @inode:	file to search in
6733 * @page:	page to read extent data into if the extent is inline
6734 * @pg_offset:	offset into @page to copy to
6735 * @start:	file offset
6736 * @len:	length of range starting at @start
6737 *
6738 * Return the first &struct extent_map which overlaps the given range, reading
6739 * it from the B-tree and caching it if necessary. Note that there may be more
6740 * extents which overlap the given range after the returned extent_map.
6741 *
6742 * If @page is not NULL and the extent is inline, this also reads the extent
6743 * data directly into the page and marks the extent up to date in the io_tree.
6744 *
6745 * Return: ERR_PTR on error, non-NULL extent_map on success.
6746 */
6747struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6748				    struct page *page, size_t pg_offset,
6749				    u64 start, u64 len)
6750{
6751	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6752	int ret = 0;
6753	u64 extent_start = 0;
6754	u64 extent_end = 0;
6755	u64 objectid = btrfs_ino(inode);
6756	int extent_type = -1;
6757	struct btrfs_path *path = NULL;
6758	struct btrfs_root *root = inode->root;
6759	struct btrfs_file_extent_item *item;
6760	struct extent_buffer *leaf;
6761	struct btrfs_key found_key;
6762	struct extent_map *em = NULL;
6763	struct extent_map_tree *em_tree = &inode->extent_tree;
6764
6765	read_lock(&em_tree->lock);
6766	em = lookup_extent_mapping(em_tree, start, len);
6767	read_unlock(&em_tree->lock);
6768
6769	if (em) {
6770		if (em->start > start || em->start + em->len <= start)
6771			free_extent_map(em);
6772		else if (em->block_start == EXTENT_MAP_INLINE && page)
6773			free_extent_map(em);
6774		else
6775			goto out;
6776	}
6777	em = alloc_extent_map();
6778	if (!em) {
6779		ret = -ENOMEM;
6780		goto out;
6781	}
6782	em->start = EXTENT_MAP_HOLE;
6783	em->orig_start = EXTENT_MAP_HOLE;
6784	em->len = (u64)-1;
6785	em->block_len = (u64)-1;
6786
6787	path = btrfs_alloc_path();
6788	if (!path) {
6789		ret = -ENOMEM;
6790		goto out;
6791	}
6792
6793	/* Chances are we'll be called again, so go ahead and do readahead */
6794	path->reada = READA_FORWARD;
6795
6796	/*
6797	 * The same explanation in load_free_space_cache applies here as well:
6798	 * we only read when we're loading the free space cache, and at that
6799	 * point the commit_root has everything we need.
6800	 */
6801	if (btrfs_is_free_space_inode(inode)) {
6802		path->search_commit_root = 1;
6803		path->skip_locking = 1;
6804	}
6805
6806	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6807	if (ret < 0) {
6808		goto out;
6809	} else if (ret > 0) {
6810		if (path->slots[0] == 0)
6811			goto not_found;
6812		path->slots[0]--;
6813		ret = 0;
6814	}
6815
6816	leaf = path->nodes[0];
6817	item = btrfs_item_ptr(leaf, path->slots[0],
6818			      struct btrfs_file_extent_item);
6819	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6820	if (found_key.objectid != objectid ||
6821	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6822		/*
6823		 * If we back up past the first extent we want to move forward
6824		 * and see if there is an extent in front of us, otherwise we'll
6825		 * say there is a hole for our whole search range which can
6826		 * cause problems.
6827		 */
6828		extent_end = start;
6829		goto next;
6830	}
6831
6832	extent_type = btrfs_file_extent_type(leaf, item);
6833	extent_start = found_key.offset;
6834	extent_end = btrfs_file_extent_end(path);
6835	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6836	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6837		/* Only regular file could have regular/prealloc extent */
6838		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6839			ret = -EUCLEAN;
6840			btrfs_crit(fs_info,
6841		"regular/prealloc extent found for non-regular inode %llu",
6842				   btrfs_ino(inode));
6843			goto out;
6844		}
6845		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6846						       extent_start);
6847	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6848		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6849						      path->slots[0],
6850						      extent_start);
6851	}
6852next:
6853	if (start >= extent_end) {
6854		path->slots[0]++;
6855		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6856			ret = btrfs_next_leaf(root, path);
6857			if (ret < 0)
6858				goto out;
6859			else if (ret > 0)
6860				goto not_found;
6861
6862			leaf = path->nodes[0];
6863		}
6864		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6865		if (found_key.objectid != objectid ||
6866		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6867			goto not_found;
6868		if (start + len <= found_key.offset)
6869			goto not_found;
6870		if (start > found_key.offset)
6871			goto next;
6872
6873		/* New extent overlaps with existing one */
6874		em->start = start;
6875		em->orig_start = start;
6876		em->len = found_key.offset - start;
6877		em->block_start = EXTENT_MAP_HOLE;
6878		goto insert;
6879	}
6880
6881	btrfs_extent_item_to_extent_map(inode, path, item, em);
6882
6883	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6884	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6885		goto insert;
6886	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6887		/*
6888		 * An inline extent can only exist at file offset 0. This is
6889		 * ensured by the tree-checker and the inline extent creation path.
6890		 * Thus all members representing file offsets should be zero.
6891		 */
6892		ASSERT(pg_offset == 0);
6893		ASSERT(extent_start == 0);
6894		ASSERT(em->start == 0);
6895
6896		/*
6897		 * btrfs_extent_item_to_extent_map() should have properly
6898		 * initialized em members already.
6899		 *
6900		 * Other members are not utilized for inline extents.
6901		 */
6902		ASSERT(em->block_start == EXTENT_MAP_INLINE);
6903		ASSERT(em->len == fs_info->sectorsize);
6904
6905		ret = read_inline_extent(inode, path, page);
6906		if (ret < 0)
6907			goto out;
6908		goto insert;
6909	}
6910not_found:
6911	em->start = start;
6912	em->orig_start = start;
6913	em->len = len;
6914	em->block_start = EXTENT_MAP_HOLE;
6915insert:
6916	ret = 0;
6917	btrfs_release_path(path);
6918	if (em->start > start || extent_map_end(em) <= start) {
6919		btrfs_err(fs_info,
6920			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
6921			  em->start, em->len, start, len);
6922		ret = -EIO;
6923		goto out;
6924	}
6925
6926	write_lock(&em_tree->lock);
6927	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6928	write_unlock(&em_tree->lock);
6929out:
6930	btrfs_free_path(path);
6931
6932	trace_btrfs_get_extent(root, inode, em);
6933
6934	if (ret) {
6935		free_extent_map(em);
6936		return ERR_PTR(ret);
6937	}
6938	return em;
6939}
6940
6941static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
6942						  struct btrfs_dio_data *dio_data,
6943						  const u64 start,
6944						  const u64 len,
6945						  const u64 orig_start,
6946						  const u64 block_start,
6947						  const u64 block_len,
6948						  const u64 orig_block_len,
6949						  const u64 ram_bytes,
6950						  const int type)
6951{
6952	struct extent_map *em = NULL;
6953	struct btrfs_ordered_extent *ordered;
6954
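	/*
	 * NOCOW writes go to an existing extent, so no new extent map is
	 * inserted for them; for all other types create a pinned extent map
	 * for the I/O range before allocating the ordered extent.
	 */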
6955	if (type != BTRFS_ORDERED_NOCOW) {
6956		em = create_io_em(inode, start, len, orig_start, block_start,
6957				  block_len, orig_block_len, ram_bytes,
6958				  BTRFS_COMPRESS_NONE, /* compress_type */
6959				  type);
6960		if (IS_ERR(em))
6961			goto out;
6962	}
6963	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
6964					     block_start, block_len, 0,
6965					     (1 << type) |
6966					     (1 << BTRFS_ORDERED_DIRECT),
6967					     BTRFS_COMPRESS_NONE);
6968	if (IS_ERR(ordered)) {
6969		if (em) {
6970			free_extent_map(em);
6971			btrfs_drop_extent_map_range(inode, start,
6972						    start + len - 1, false);
6973		}
6974		em = ERR_CAST(ordered);
6975	} else {
6976		ASSERT(!dio_data->ordered);
6977		dio_data->ordered = ordered;
6978	}
6979 out:
6980
6981	return em;
6982}
6983
6984static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
6985						  struct btrfs_dio_data *dio_data,
6986						  u64 start, u64 len)
6987{
6988	struct btrfs_root *root = inode->root;
6989	struct btrfs_fs_info *fs_info = root->fs_info;
6990	struct extent_map *em;
6991	struct btrfs_key ins;
6992	u64 alloc_hint;
6993	int ret;
6994
6995	alloc_hint = get_extent_allocation_hint(inode, start, len);
6996again:
6997	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
6998				   0, alloc_hint, &ins, 1, 1);
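	/*
	 * -EAGAIN here can only come from a zoned filesystem that needs a
	 * zone finish before more space becomes available, so wait for that
	 * and retry the reservation.
	 */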
6999	if (ret == -EAGAIN) {
7000		ASSERT(btrfs_is_zoned(fs_info));
7001		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
7002			       TASK_UNINTERRUPTIBLE);
7003		goto again;
7004	}
7005	if (ret)
7006		return ERR_PTR(ret);
7007
7008	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
7009				     ins.objectid, ins.offset, ins.offset,
7010				     ins.offset, BTRFS_ORDERED_REGULAR);
7011	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7012	if (IS_ERR(em))
7013		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7014					   1);
7015
7016	return em;
7017}
7018
7019static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7020{
7021	struct btrfs_block_group *block_group;
7022	bool readonly = false;
7023
7024	block_group = btrfs_lookup_block_group(fs_info, bytenr);
7025	if (!block_group || block_group->ro)
7026		readonly = true;
7027	if (block_group)
7028		btrfs_put_block_group(block_group);
7029	return readonly;
7030}
7031
7032/*
7033 * Check if we can do nocow write into the range [@offset, @offset + @len)
7034 *
7035 * @offset:	File offset
7036 * @len:	The length to write, will be updated to the nocow writeable
7037 *		range
7038 * @orig_start:	(optional) Return the original file offset of the file extent
7039 * @orig_block_len: (optional) Return the original on-disk length of the file extent
7040 * @ram_bytes:	(optional) Return the ram_bytes of the file extent
7041 * @strict:	if true, omit optimizations that might force us into unnecessary
7042 *		cow, e.g. don't trust the generation number.
7043 *
7044 * Return:
7045 * >0	and update @len if we can do nocow write
7046 *  0	if we can't do nocow write
7047 * <0	if error happened
7048 *
7049 * NOTE: This only checks the file extents; the caller is responsible for
7050 *	 waiting for any ordered extents.
7051 */
7052noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7053			      u64 *orig_start, u64 *orig_block_len,
7054			      u64 *ram_bytes, bool nowait, bool strict)
7055{
7056	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7057	struct can_nocow_file_extent_args nocow_args = { 0 };
7058	struct btrfs_path *path;
7059	int ret;
7060	struct extent_buffer *leaf;
7061	struct btrfs_root *root = BTRFS_I(inode)->root;
7062	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7063	struct btrfs_file_extent_item *fi;
7064	struct btrfs_key key;
7065	int found_type;
7066
7067	path = btrfs_alloc_path();
7068	if (!path)
7069		return -ENOMEM;
7070	path->nowait = nowait;
7071
7072	ret = btrfs_lookup_file_extent(NULL, root, path,
7073			btrfs_ino(BTRFS_I(inode)), offset, 0);
7074	if (ret < 0)
7075		goto out;
7076
7077	if (ret == 1) {
7078		if (path->slots[0] == 0) {
7079			/* can't find the item, must cow */
7080			ret = 0;
7081			goto out;
7082		}
7083		path->slots[0]--;
7084	}
7085	ret = 0;
7086	leaf = path->nodes[0];
7087	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7088	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7089	    key.type != BTRFS_EXTENT_DATA_KEY) {
7090		/* not our file or wrong item type, must cow */
7091		goto out;
7092	}
7093
7094	if (key.offset > offset) {
7095		/* Wrong offset, must cow */
7096		goto out;
7097	}
7098
7099	if (btrfs_file_extent_end(path) <= offset)
7100		goto out;
7101
7102	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7103	found_type = btrfs_file_extent_type(leaf, fi);
7104	if (ram_bytes)
7105		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7106
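	/*
	 * Fill in the arguments and let can_nocow_file_extent() do the
	 * per-extent checks (extent type, compression, whether the extent is
	 * shared, ...) that decide if a NOCOW write is possible at all.
	 */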
7107	nocow_args.start = offset;
7108	nocow_args.end = offset + *len - 1;
7109	nocow_args.strict = strict;
7110	nocow_args.free_path = true;
7111
7112	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7113	/* can_nocow_file_extent() has freed the path. */
7114	path = NULL;
7115
7116	if (ret != 1) {
7117		/* Treat errors as not being able to NOCOW. */
7118		ret = 0;
7119		goto out;
7120	}
7121
7122	ret = 0;
7123	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
7124		goto out;
7125
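	/*
	 * A preallocated extent on an inode without NODATACOW set cannot be
	 * written NOCOW while the range still has pending delalloc, so return
	 * -EAGAIN in that case.
	 */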
7126	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7127	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7128		u64 range_end;
7129
7130		range_end = round_up(offset + nocow_args.num_bytes,
7131				     root->fs_info->sectorsize) - 1;
7132		ret = test_range_bit(io_tree, offset, range_end,
7133				     EXTENT_DELALLOC, 0, NULL);
7134		if (ret) {
7135			ret = -EAGAIN;
7136			goto out;
7137		}
7138	}
7139
7140	if (orig_start)
7141		*orig_start = key.offset - nocow_args.extent_offset;
7142	if (orig_block_len)
7143		*orig_block_len = nocow_args.disk_num_bytes;
7144
7145	*len = nocow_args.num_bytes;
7146	ret = 1;
7147out:
7148	btrfs_free_path(path);
7149	return ret;
7150}
7151
7152static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7153			      struct extent_state **cached_state,
7154			      unsigned int iomap_flags)
7155{
7156	const bool writing = (iomap_flags & IOMAP_WRITE);
7157	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7158	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7159	struct btrfs_ordered_extent *ordered;
7160	int ret = 0;
7161
7162	while (1) {
7163		if (nowait) {
7164			if (!try_lock_extent(io_tree, lockstart, lockend,
7165					     cached_state))
7166				return -EAGAIN;
7167		} else {
7168			lock_extent(io_tree, lockstart, lockend, cached_state);
7169		}
7170		/*
7171		 * We're concerned with the entire range that we're going to be
7172		 * doing DIO to, so we need to make sure there are no ordered
7173		 * extents in this range.
7174		 */
7175		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7176						     lockend - lockstart + 1);
7177
7178		/*
7179		 * We need to make sure there are no buffered pages in this
7180		 * range either, we could have raced between the invalidate in
7181		 * generic_file_direct_write and locking the extent.  The
7182		 * invalidate needs to happen so that reads after a write do not
7183		 * get stale data.
7184		 */
7185		if (!ordered &&
7186		    (!writing || !filemap_range_has_page(inode->i_mapping,
7187							 lockstart, lockend)))
7188			break;
7189
7190		unlock_extent(io_tree, lockstart, lockend, cached_state);
7191
7192		if (ordered) {
7193			if (nowait) {
7194				btrfs_put_ordered_extent(ordered);
7195				ret = -EAGAIN;
7196				break;
7197			}
7198			/*
7199			 * If we are doing a DIO read and the ordered extent we
7200			 * found is for a buffered write, we can not wait for it
7201			 * to complete and retry, because if we do so we can
7202			 * deadlock with concurrent buffered writes on page
7203			 * locks. This happens only if our DIO read covers more
7204			 * than one extent map, if at this point it has already
7205			 * created an ordered extent for a previous extent map
7206			 * and locked its range in the inode's io tree, and a
7207			 * concurrent write against that previous extent map's
7208			 * range and this range started (we unlock the ranges
7209			 * in the io tree only when the bios complete and
7210			 * buffered writes always lock pages before attempting
7211			 * to lock range in the io tree).
7212			 */
7213			if (writing ||
7214			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7215				btrfs_start_ordered_extent(ordered);
7216			else
7217				ret = nowait ? -EAGAIN : -ENOTBLK;
7218			btrfs_put_ordered_extent(ordered);
7219		} else {
7220			/*
7221			 * We could trigger writeback for this range (and wait
7222			 * for it to complete) and then invalidate the pages for
7223			 * this range (through invalidate_inode_pages2_range()),
7224			 * but that can lead us to a deadlock with a concurrent
7225			 * call to readahead (a buffered read or a defrag call
7226			 * triggered a readahead) on a page lock due to an
7227			 * ordered dio extent we created before but have not yet
7228			 * submitted a corresponding bio for (so it can not
7229			 * complete), which makes readahead wait for that
7230			 * ordered extent to complete while holding a lock on
7231			 * that page.
7232			 */
7233			ret = nowait ? -EAGAIN : -ENOTBLK;
7234		}
7235
7236		if (ret)
7237			break;
7238
7239		cond_resched();
7240	}
7241
7242	return ret;
7243}
7244
7245/* The callers of this must take lock_extent() */
7246static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7247				       u64 len, u64 orig_start, u64 block_start,
7248				       u64 block_len, u64 orig_block_len,
7249				       u64 ram_bytes, int compress_type,
7250				       int type)
7251{
7252	struct extent_map *em;
7253	int ret;
7254
7255	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7256	       type == BTRFS_ORDERED_COMPRESSED ||
7257	       type == BTRFS_ORDERED_NOCOW ||
7258	       type == BTRFS_ORDERED_REGULAR);
7259
7260	em = alloc_extent_map();
7261	if (!em)
7262		return ERR_PTR(-ENOMEM);
7263
7264	em->start = start;
7265	em->orig_start = orig_start;
7266	em->len = len;
7267	em->block_len = block_len;
7268	em->block_start = block_start;
7269	em->orig_block_len = orig_block_len;
7270	em->ram_bytes = ram_bytes;
7271	em->generation = -1;
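	/* Pin the em so it's not merged until the ordered extent completes. */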
7272	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7273	if (type == BTRFS_ORDERED_PREALLOC) {
7274		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7275	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7276		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7277		em->compress_type = compress_type;
7278	}
7279
7280	ret = btrfs_replace_extent_map_range(inode, em, true);
7281	if (ret) {
7282		free_extent_map(em);
7283		return ERR_PTR(ret);
7284	}
7285
7286	/* The em now has 2 refs, the caller needs to do free_extent_map() once. */
7287	return em;
7288}
7289
7290
7291static int btrfs_get_blocks_direct_write(struct extent_map **map,
7292					 struct inode *inode,
7293					 struct btrfs_dio_data *dio_data,
7294					 u64 start, u64 *lenp,
7295					 unsigned int iomap_flags)
7296{
7297	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7298	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7299	struct extent_map *em = *map;
7300	int type;
7301	u64 block_start, orig_start, orig_block_len, ram_bytes;
7302	struct btrfs_block_group *bg;
7303	bool can_nocow = false;
7304	bool space_reserved = false;
7305	u64 len = *lenp;
7306	u64 prev_len;
7307	int ret = 0;
7308
7309	/*
7310	 * We don't allocate a new extent in the following cases
7311	 *
7312	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
7313	 * existing extent.
7314	 * 2) The extent is marked as PREALLOC. We're good to go here and can
7315	 * just use the extent.
7316	 *
7317	 */
7318	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7319	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7320	     em->block_start != EXTENT_MAP_HOLE)) {
7321		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7322			type = BTRFS_ORDERED_PREALLOC;
7323		else
7324			type = BTRFS_ORDERED_NOCOW;
7325		len = min(len, em->len - (start - em->start));
7326		block_start = em->block_start + (start - em->start);
7327
7328		if (can_nocow_extent(inode, start, &len, &orig_start,
7329				     &orig_block_len, &ram_bytes, false, false) == 1) {
7330			bg = btrfs_inc_nocow_writers(fs_info, block_start);
7331			if (bg)
7332				can_nocow = true;
7333		}
7334	}
7335
7336	prev_len = len;
7337	if (can_nocow) {
7338		struct extent_map *em2;
7339
7340		/* We can NOCOW, so only need to reserve metadata space. */
7341		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7342						      nowait);
7343		if (ret < 0) {
7344			/* Our caller expects us to free the input extent map. */
7345			free_extent_map(em);
7346			*map = NULL;
7347			btrfs_dec_nocow_writers(bg);
7348			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
7349				ret = -EAGAIN;
7350			goto out;
7351		}
7352		space_reserved = true;
7353
7354		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
7355					      orig_start, block_start,
7356					      len, orig_block_len,
7357					      ram_bytes, type);
7358		btrfs_dec_nocow_writers(bg);
7359		if (type == BTRFS_ORDERED_PREALLOC) {
7360			free_extent_map(em);
7361			*map = em2;
7362			em = em2;
7363		}
7364
7365		if (IS_ERR(em2)) {
7366			ret = PTR_ERR(em2);
7367			goto out;
7368		}
7369
7370		dio_data->nocow_done = true;
7371	} else {
7372		/* Our caller expects us to free the input extent map. */
7373		free_extent_map(em);
7374		*map = NULL;
7375
7376		if (nowait) {
7377			ret = -EAGAIN;
7378			goto out;
7379		}
7380
7381		/*
7382		 * If we could not allocate data space before locking the file
7383		 * range and we can't do a NOCOW write, then we have to fail.
7384		 */
7385		if (!dio_data->data_space_reserved) {
7386			ret = -ENOSPC;
7387			goto out;
7388		}
7389
7390		/*
7391		 * We have to COW and we have already reserved data space before,
7392		 * so now we reserve only metadata.
7393		 */
7394		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7395						      false);
7396		if (ret < 0)
7397			goto out;
7398		space_reserved = true;
7399
7400		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
7401		if (IS_ERR(em)) {
7402			ret = PTR_ERR(em);
7403			goto out;
7404		}
7405		*map = em;
7406		len = min(len, em->len - (start - em->start));
7407		if (len < prev_len)
7408			btrfs_delalloc_release_metadata(BTRFS_I(inode),
7409							prev_len - len, true);
7410	}
7411
7412	/*
7413	 * We have created our ordered extent, so we can now release our reservation
7414	 * for an outstanding extent.
7415	 */
7416	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7417
7418	/*
7419	 * Need to update the i_size under the extent lock so buffered
7420	 * readers will get the updated i_size when we unlock.
7421	 */
7422	if (start + len > i_size_read(inode))
7423		i_size_write(inode, start + len);
7424out:
7425	if (ret && space_reserved) {
7426		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7427		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7428	}
7429	*lenp = len;
7430	return ret;
7431}
7432
7433static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7434		loff_t length, unsigned int flags, struct iomap *iomap,
7435		struct iomap *srcmap)
7436{
7437	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7438	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7439	struct extent_map *em;
7440	struct extent_state *cached_state = NULL;
7441	struct btrfs_dio_data *dio_data = iter->private;
7442	u64 lockstart, lockend;
7443	const bool write = !!(flags & IOMAP_WRITE);
7444	int ret = 0;
7445	u64 len = length;
7446	const u64 data_alloc_len = length;
7447	bool unlock_extents = false;
7448
7449	/*
7450	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
7451	 * we're NOWAIT we may submit a bio for a partial range and return
7452	 * EIOCBQUEUED, which would result in an errant short read.
7453	 *
7454	 * The best way to handle this would be to allow for partial completions
7455	 * of iocb's, so we could submit the partial bio, return and fault in
7456	 * the rest of the pages, and then submit the io for the rest of the
7457	 * range.  However we don't have that currently, so simply return
7458	 * -EAGAIN at this point so that the normal path is used.
7459	 */
7460	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
7461		return -EAGAIN;
7462
7463	/*
7464	 * Cap the size of reads to that usually seen in buffered I/O as we need
7465	 * to allocate a contiguous array for the checksums.
7466	 */
7467	if (!write)
7468		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
7469
7470	lockstart = start;
7471	lockend = start + len - 1;
7472
7473	/*
7474	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
7475	 * enough if we've written compressed pages to this area, so we need to
7476	 * flush the dirty pages again to make absolutely sure that any
7477	 * outstanding dirty pages are on disk - the first flush only starts
7478	 * compression on the data, while keeping the pages locked, so by the
7479	 * time the second flush returns we know bios for the compressed pages
7480	 * were submitted and finished, and the pages no longer under writeback.
7481	 * were submitted and finished, and the pages are no longer under writeback.
7482	 * If we have a NOWAIT request and we have any pages in the range that
7483	 * are locked, likely due to compression still in progress, we don't want
7484	 * to block on page locks. We also don't want to block on pages marked as
7485	 * dirty or under writeback (same as for the non-compression case).
7486	 * iomap_dio_rw() did the same check, but after that and before we got
7487	 * here, mmap'ed writes may have happened or buffered reads started
7488	 * (readpage() and readahead(), which lock pages), as we haven't locked
7489	 * the file range yet.
7490	 */
7491	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7492		     &BTRFS_I(inode)->runtime_flags)) {
7493		if (flags & IOMAP_NOWAIT) {
7494			if (filemap_range_needs_writeback(inode->i_mapping,
7495							  lockstart, lockend))
7496				return -EAGAIN;
7497		} else {
7498			ret = filemap_fdatawrite_range(inode->i_mapping, start,
7499						       start + length - 1);
7500			if (ret)
7501				return ret;
7502		}
7503	}
7504
7505	memset(dio_data, 0, sizeof(*dio_data));
7506
7507	/*
7508	 * We always try to allocate data space and must do it before locking
7509	 * the file range, to avoid deadlocks with concurrent writes to the same
7510	 * range if the range has several extents and the writes don't expand the
7511	 * current i_size (the inode lock is taken in shared mode). If we fail to
7512	 * allocate data space here we continue and later, after locking the
7513	 * file range, we fail with ENOSPC only if we figure out we can not do a
7514	 * NOCOW write.
7515	 */
7516	if (write && !(flags & IOMAP_NOWAIT)) {
7517		ret = btrfs_check_data_free_space(BTRFS_I(inode),
7518						  &dio_data->data_reserved,
7519						  start, data_alloc_len, false);
7520		if (!ret)
7521			dio_data->data_space_reserved = true;
7522		else if (ret && !(BTRFS_I(inode)->flags &
7523				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
7524			goto err;
7525	}
7526
7527	/*
7528	 * If this errors out it's because we couldn't invalidate pagecache for
7529	 * this range and we need to fall back to buffered IO, or we are doing a
7530	 * NOWAIT read/write and we need to block.
7531	 */
7532	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
7533	if (ret < 0)
7534		goto err;
7535
7536	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7537	if (IS_ERR(em)) {
7538		ret = PTR_ERR(em);
7539		goto unlock_err;
7540	}
7541
7542	/*
7543	 * OK, for INLINE and COMPRESSED extents we need to fall back to buffered
7544	 * IO.  INLINE is special, and we could probably kludge it in here, but
7545	 * it's still buffered so for safety let's just fall back to the generic
7546	 * buffered path.
7547	 *
7548	 * For COMPRESSED we _have_ to read the entire extent in so we can
7549	 * decompress it, so there will be buffering required no matter what we
7550	 * do, so go ahead and fall back to buffered.
7551	 *
7552	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7553	 * to buffered IO.  Don't blame me, this is the price we pay for using
7554	 * the generic code.
7555	 */
7556	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7557	    em->block_start == EXTENT_MAP_INLINE) {
7558		free_extent_map(em);
7559		/*
7560		 * If we are in a NOWAIT context, return -EAGAIN in order to
7561		 * fall back to buffered IO. This is not only because we can
7562		 * block with buffered IO (no support for NOWAIT semantics at
7563		 * the moment) but also to avoid returning short reads to user
7564		 * space - this happens if we were able to read some data from
7565		 * previous non-compressed extents and then when we fall back to
7566		 * buffered IO, at btrfs_file_read_iter() by calling
7567		 * filemap_read(), we fail to fault in pages for the read buffer,
7568		 * in which case filemap_read() returns a short read (the number
7569		 * of bytes previously read is > 0, so it does not return -EFAULT).
7570		 */
7571		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
7572		goto unlock_err;
7573	}
7574
7575	len = min(len, em->len - (start - em->start));
7576
7577	/*
7578	 * If we have a NOWAIT request and the range contains multiple extents
7579	 * (or a mix of extents and holes), then we return -EAGAIN to make the
7580	 * caller fallback to a context where it can do a blocking (without
7581	 * caller fall back to a context where it can do a blocking (without
7582	 * success to the caller, which is not optimal for writes and for reads
7583	 * it can result in unexpected behaviour for an application.
7584	 *
7585	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7586	 * iomap_dio_rw(), we can end up returning less data then what the caller
7587	 * iomap_dio_rw(), we can end up returning less data than what the caller
7588	 * That is, the caller asked to read N bytes and we return less than that,
7589	 * which is wrong unless we are crossing EOF. This happens if we get a
7590	 * page fault error when trying to fault in pages for the buffer that is
7591	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7592	 * have previously submitted bios for other extents in the range, in
7593	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7594	 * those bios have completed by the time we get the page fault error,
7595	 * which we return back to our caller - we should only return EIOCBQUEUED
7596	 * after we have submitted bios for all the extents in the range.
7597	 */
7598	if ((flags & IOMAP_NOWAIT) && len < length) {
7599		free_extent_map(em);
7600		ret = -EAGAIN;
7601		goto unlock_err;
7602	}
7603
7604	if (write) {
7605		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7606						    start, &len, flags);
7607		if (ret < 0)
7608			goto unlock_err;
7609		unlock_extents = true;
7610		/* Recalc len in case the new em is smaller than requested */
7611		len = min(len, em->len - (start - em->start));
7612		if (dio_data->data_space_reserved) {
7613			u64 release_offset;
7614			u64 release_len = 0;
7615
7616			if (dio_data->nocow_done) {
7617				release_offset = start;
7618				release_len = data_alloc_len;
7619			} else if (len < data_alloc_len) {
7620				release_offset = start + len;
7621				release_len = data_alloc_len - len;
7622			}
7623
7624			if (release_len > 0)
7625				btrfs_free_reserved_data_space(BTRFS_I(inode),
7626							       dio_data->data_reserved,
7627							       release_offset,
7628							       release_len);
7629		}
7630	} else {
7631		/*
7632		 * We need to unlock only the end area that we aren't using.
7633		 * The rest is going to be unlocked by the endio routine.
7634		 */
7635		lockstart = start + len;
7636		if (lockstart < lockend)
7637			unlock_extents = true;
7638	}
7639
7640	if (unlock_extents)
7641		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7642			      &cached_state);
7643	else
7644		free_extent_state(cached_state);
7645
7646	/*
7647	 * Translate extent map information to iomap.
7648	 * We trim the extents (and move the addr) even though iomap code does
7649	 * that, since we have locked only the parts we are performing I/O in.
7650	 */
7651	if ((em->block_start == EXTENT_MAP_HOLE) ||
7652	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7653		iomap->addr = IOMAP_NULL_ADDR;
7654		iomap->type = IOMAP_HOLE;
7655	} else {
7656		iomap->addr = em->block_start + (start - em->start);
7657		iomap->type = IOMAP_MAPPED;
7658	}
7659	iomap->offset = start;
7660	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7661	iomap->length = len;
7662	free_extent_map(em);
7663
7664	return 0;
7665
7666unlock_err:
7667	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7668		      &cached_state);
7669err:
7670	if (dio_data->data_space_reserved) {
7671		btrfs_free_reserved_data_space(BTRFS_I(inode),
7672					       dio_data->data_reserved,
7673					       start, data_alloc_len);
7674		extent_changeset_free(dio_data->data_reserved);
7675	}
7676
7677	return ret;
7678}
7679
7680static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7681		ssize_t written, unsigned int flags, struct iomap *iomap)
7682{
7683	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7684	struct btrfs_dio_data *dio_data = iter->private;
7685	size_t submitted = dio_data->submitted;
7686	const bool write = !!(flags & IOMAP_WRITE);
7687	int ret = 0;
7688
7689	if (!write && (iomap->type == IOMAP_HOLE)) {
7690		/* If reading from a hole, unlock and return */
7691		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
7692			      NULL);
7693		return 0;
7694	}
7695
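	/*
	 * Fewer bytes were submitted than requested, so clean up the tail we
	 * never submitted and return -ENOTBLK so the caller falls back to
	 * buffered IO for the rest.
	 */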
7696	if (submitted < length) {
7697		pos += submitted;
7698		length -= submitted;
7699		if (write)
7700			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7701						    pos, length, false);
7702		else
7703			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7704				      pos + length - 1, NULL);
7705		ret = -ENOTBLK;
7706	}
7707	if (write) {
7708		btrfs_put_ordered_extent(dio_data->ordered);
7709		dio_data->ordered = NULL;
7710	}
7711
7712	if (write)
7713		extent_changeset_free(dio_data->data_reserved);
7714	return ret;
7715}
7716
7717static void btrfs_dio_end_io(struct btrfs_bio *bbio)
7718{
7719	struct btrfs_dio_private *dip =
7720		container_of(bbio, struct btrfs_dio_private, bbio);
7721	struct btrfs_inode *inode = bbio->inode;
7722	struct bio *bio = &bbio->bio;
7723
7724	if (bio->bi_status) {
7725		btrfs_warn(inode->root->fs_info,
7726		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
7727			   btrfs_ino(inode), bio->bi_opf,
7728			   dip->file_offset, dip->bytes, bio->bi_status);
7729	}
7730
7731	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
7732		btrfs_finish_ordered_extent(bbio->ordered, NULL,
7733					    dip->file_offset, dip->bytes,
7734					    !bio->bi_status);
7735	} else {
7736		unlock_extent(&inode->io_tree, dip->file_offset,
7737			      dip->file_offset + dip->bytes - 1, NULL);
7738	}
7739
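	/*
	 * Restore the bi_private pointer iomap set on the bio (saved in
	 * bbio->private by btrfs_bio_init()) before handing the bio back to
	 * iomap's completion handler.
	 */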
7740	bbio->bio.bi_private = bbio->private;
7741	iomap_dio_bio_end_io(bio);
7742}
7743
7744static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
7745				loff_t file_offset)
7746{
7747	struct btrfs_bio *bbio = btrfs_bio(bio);
7748	struct btrfs_dio_private *dip =
7749		container_of(bbio, struct btrfs_dio_private, bbio);
7750	struct btrfs_dio_data *dio_data = iter->private;
7751
7752	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
7753		       btrfs_dio_end_io, bio->bi_private);
7754	bbio->inode = BTRFS_I(iter->inode);
7755	bbio->file_offset = file_offset;
7756
7757	dip->file_offset = file_offset;
7758	dip->bytes = bio->bi_iter.bi_size;
7759
7760	dio_data->submitted += bio->bi_iter.bi_size;
7761
7762	/*
7763	 * Check if we are doing a partial write.  If we are, we need to split
7764	 * the ordered extent to match the submitted bio.  Hang on to the
7765	 * remaining unfinishable ordered_extent in dio_data so that it can be
7766	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
7767	 * remaining pages is blocked on the outstanding ordered extent.
7768	 */
7769	if (iter->flags & IOMAP_WRITE) {
7770		int ret;
7771
7772		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
7773		if (ret) {
7774			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7775						    file_offset, dip->bytes,
7776						    !ret);
7777			bio->bi_status = errno_to_blk_status(ret);
7778			iomap_dio_bio_end_io(bio);
7779			return;
7780		}
7781	}
7782
7783	btrfs_submit_bio(bbio, 0);
7784}
7785
7786static const struct iomap_ops btrfs_dio_iomap_ops = {
7787	.iomap_begin            = btrfs_dio_iomap_begin,
7788	.iomap_end              = btrfs_dio_iomap_end,
7789};
7790
7791static const struct iomap_dio_ops btrfs_dio_ops = {
7792	.submit_io		= btrfs_dio_submit_io,
7793	.bio_set		= &btrfs_dio_bioset,
7794};
7795
7796ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
7797{
7798	struct btrfs_dio_data data = { 0 };
7799
7800	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7801			    IOMAP_DIO_PARTIAL, &data, done_before);
7802}
7803
7804struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
7805				  size_t done_before)
7806{
7807	struct btrfs_dio_data data = { 0 };
7808
7809	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7810			    IOMAP_DIO_PARTIAL, &data, done_before);
7811}
7812
7813static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7814			u64 start, u64 len)
7815{
7816	int	ret;
7817
7818	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
7819	if (ret)
7820		return ret;
7821
7822	/*
7823	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
7824	 * file range (0 to LLONG_MAX), but that is not enough if we have
7825	 * compression enabled. The first filemap_fdatawrite_range() only kicks
7826	 * in the compression of data (in an async thread) and will return
7827	 * before the compression is done and writeback is started. A second
7828	 * filemap_fdatawrite_range() is needed to wait for the compression to
7829	 * complete and writeback to start. We also need to wait for ordered
7830	 * extents to complete, because our fiemap implementation uses mainly
7831	 * file extent items to list the extents, searching for extent maps
7832	 * only for file ranges with holes or prealloc extents to figure out
7833	 * if we have delalloc in those ranges.
7834	 */
7835	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7836		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7837		if (ret)
7838			return ret;
7839	}
7840
7841	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
7842}
7843
7844static int btrfs_writepages(struct address_space *mapping,
7845			    struct writeback_control *wbc)
7846{
7847	return extent_writepages(mapping, wbc);
7848}
7849
7850static void btrfs_readahead(struct readahead_control *rac)
7851{
7852	extent_readahead(rac);
7853}
7854
7855/*
7856 * For release_folio() and invalidate_folio() we have a race window where
7857 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7858 * If we continue to release/invalidate the page, we could cause a
7859 * use-after-free on the subpage spinlock.  So this function spins and waits
7860 * for the subpage spinlock to be released.
7861 */
7862static void wait_subpage_spinlock(struct page *page)
7863{
7864	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7865	struct btrfs_subpage *subpage;
7866
7867	if (!btrfs_is_subpage(fs_info, page))
7868		return;
7869
7870	ASSERT(PagePrivate(page) && page->private);
7871	subpage = (struct btrfs_subpage *)page->private;
7872
7873	/*
7874	 * This may look insane as we just acquire the spinlock and release it,
7875	 * without doing anything.  But we just want to make sure no one is
7876	 * still holding the subpage spinlock.
7877	 * And since the page is neither dirty nor under writeback, and we have
7878	 * the page locked, the only possible way to hold a spinlock is from the
7879	 * endio function to clear page writeback.
7880	 *
7881	 * Here we just acquire the spinlock so that all existing callers
7882	 * should exit and we're safe to release/invalidate the page.
7883	 */
7884	spin_lock_irq(&subpage->lock);
7885	spin_unlock_irq(&subpage->lock);
7886}
7887
7888static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7889{
7890	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
7891
7892	if (ret == 1) {
7893		wait_subpage_spinlock(&folio->page);
7894		clear_page_extent_mapped(&folio->page);
7895	}
7896	return ret;
7897}
7898
7899static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7900{
7901	if (folio_test_writeback(folio) || folio_test_dirty(folio))
7902		return false;
7903	return __btrfs_release_folio(folio, gfp_flags);
7904}
7905
7906#ifdef CONFIG_MIGRATION
7907static int btrfs_migrate_folio(struct address_space *mapping,
7908			     struct folio *dst, struct folio *src,
7909			     enum migrate_mode mode)
7910{
7911	int ret = filemap_migrate_folio(mapping, dst, src, mode);
7912
7913	if (ret != MIGRATEPAGE_SUCCESS)
7914		return ret;
7915
7916	if (folio_test_ordered(src)) {
7917		folio_clear_ordered(src);
7918		folio_set_ordered(dst);
7919	}
7920
7921	return MIGRATEPAGE_SUCCESS;
7922}
7923#else
7924#define btrfs_migrate_folio NULL
7925#endif
7926
7927static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7928				 size_t length)
7929{
7930	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
7931	struct btrfs_fs_info *fs_info = inode->root->fs_info;
7932	struct extent_io_tree *tree = &inode->io_tree;
7933	struct extent_state *cached_state = NULL;
7934	u64 page_start = folio_pos(folio);
7935	u64 page_end = page_start + folio_size(folio) - 1;
7936	u64 cur;
7937	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7938
7939	/*
7940	 * We have the folio locked, so no new ordered extent can be created on
7941	 * this folio, nor can a bio be submitted for it.
7942	 *
7943	 * But an already submitted bio can still be finished on this folio.
7944	 * Furthermore, the endio function won't skip a folio which has Ordered
7945	 * (Private2) already cleared, so it's possible for endio and
7946	 * invalidate_folio to do the same ordered extent accounting twice
7947	 * on one folio.
7948	 *
7949	 * So here we wait for any submitted bios to finish, so that we won't
7950	 * do double ordered extent accounting on the same folio.
7951	 */
7952	folio_wait_writeback(folio);
7953	wait_subpage_spinlock(&folio->page);
7954
7955	/*
7956	 * For the subpage case, we have call sites like
7957	 * btrfs_punch_hole_lock_range() which pass a range not aligned to the
7958	 * sectorsize.
7959	 * If the range doesn't cover the full folio, we don't need to and
7960	 * shouldn't clear page extent mapped, as folio->private can still
7961	 * record subpage dirty bits for other parts of the range.
7962	 *
7963	 * For cases that invalidate the full folio even though the range doesn't
7964	 * cover the full folio, like invalidating the last folio, we're
7965	 * still safe to wait for the ordered extent to finish.
7966	 */
7967	if (!(offset == 0 && length == folio_size(folio))) {
7968		btrfs_release_folio(folio, GFP_NOFS);
7969		return;
7970	}
7971
7972	if (!inode_evicting)
7973		lock_extent(tree, page_start, page_end, &cached_state);
7974
7975	cur = page_start;
7976	while (cur < page_end) {
7977		struct btrfs_ordered_extent *ordered;
7978		u64 range_end;
7979		u32 range_len;
7980		u32 extra_flags = 0;
7981
7982		ordered = btrfs_lookup_first_ordered_range(inode, cur,
7983							   page_end + 1 - cur);
7984		if (!ordered) {
7985			range_end = page_end;
7986			/*
7987			 * No ordered extent covering this range, we are safe
7988			 * to delete all extent states in the range.
7989			 */
7990			extra_flags = EXTENT_CLEAR_ALL_BITS;
7991			goto next;
7992		}
7993		if (ordered->file_offset > cur) {
7994			/*
7995			 * There is a range between [cur, oe->file_offset) not
7996			 * covered by any ordered extent.
7997			 * We are safe to delete all extent states, and handle
7998			 * the ordered extent in the next iteration.
7999			 */
8000			range_end = ordered->file_offset - 1;
8001			extra_flags = EXTENT_CLEAR_ALL_BITS;
8002			goto next;
8003		}
8004
8005		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8006				page_end);
8007		ASSERT(range_end + 1 - cur < U32_MAX);
8008		range_len = range_end + 1 - cur;
8009		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8010			/*
8011			 * If Ordered (Private2) is cleared, it means endio has
8012			 * already been executed for the range.
8013			 * We can't delete the extent states as
8014			 * btrfs_finish_ordered_io() may still use some of them.
8015			 */
8016			goto next;
8017		}
8018		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8019
8020		/*
8021		 * IO on this page will never be started, so we need to account
8022		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8023		 * here, we must leave that up to the ordered extent completion.
8024		 *
8025		 * This will also unlock the range for incoming
8026		 * btrfs_finish_ordered_io().
8027		 */
8028		if (!inode_evicting)
8029			clear_extent_bit(tree, cur, range_end,
8030					 EXTENT_DELALLOC |
8031					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8032					 EXTENT_DEFRAG, &cached_state);
8033
8034		spin_lock_irq(&inode->ordered_tree.lock);
8035		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8036		ordered->truncated_len = min(ordered->truncated_len,
8037					     cur - ordered->file_offset);
8038		spin_unlock_irq(&inode->ordered_tree.lock);
8039
8040		/*
8041		 * If the ordered extent has finished, we're safe to delete all
8042		 * the extent states of the range, otherwise
8043		 * btrfs_finish_ordered_io() will get executed by endio for
8044		 * other pages, so we can't delete extent states.
8045		 */
8046		if (btrfs_dec_test_ordered_pending(inode, &ordered,
8047						   cur, range_end + 1 - cur)) {
8048			btrfs_finish_ordered_io(ordered);
8049			/*
8050			 * The ordered extent has finished, now we're again
8051			 * safe to delete all extent states of the range.
8052			 */
8053			extra_flags = EXTENT_CLEAR_ALL_BITS;
8054		}
8055next:
8056		if (ordered)
8057			btrfs_put_ordered_extent(ordered);
8058		/*
8059		 * Qgroup reserved space handler
8060		 * Sector(s) here will be either:
8061		 *
8062		 * 1) Already written to disk or bio already finished
8063		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8064		 *    Qgroup will be handled by its qgroup_record then.
8065		 *    btrfs_qgroup_free_data() call will do nothing here.
8066		 *
8067		 * 2) Not written to disk yet
8068		 *    Then btrfs_qgroup_free_data() call will clear the
8069		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8070		 *    reserved data space, since the IO will never happen for
8071		 *    this page.
8072		 */
8073		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
8074		if (!inode_evicting) {
8075			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8076				 EXTENT_DELALLOC | EXTENT_UPTODATE |
8077				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
8078				 extra_flags, &cached_state);
8079		}
8080		cur = range_end + 1;
8081	}
8082	/*
8083	 * We have iterated through all ordered extents of the page, so the page
8084	 * should not have Ordered (Private2) anymore, or the above iteration
8085	 * did something wrong.
8086	 */
8087	ASSERT(!folio_test_ordered(folio));
8088	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8089	if (!inode_evicting)
8090		__btrfs_release_folio(folio, GFP_NOFS);
8091	clear_page_extent_mapped(&folio->page);
8092}
8093
8094/*
8095 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8096 * called from a page fault handler when a page is first dirtied. Hence we must
8097 * be careful to check for EOF conditions here. We set the page up correctly
8098 * for a written page which means we get ENOSPC checking when writing into
8099 * holes and correct delalloc and unwritten extent mapping on filesystems that
8100 * support these features.
8101 *
8102 * We are not allowed to take the i_mutex here so we have to play games to
8103 * protect against truncate races as the page could now be beyond EOF.  Because
8104 * truncate_setsize() writes the inode size before removing pages, once we have
8105 * the page lock we can determine safely if the page is beyond EOF. If it is not
8106 * beyond EOF, then the page is guaranteed safe against truncation until we
8107 * unlock the page.
8108 */
8109vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8110{
8111	struct page *page = vmf->page;
8112	struct inode *inode = file_inode(vmf->vma->vm_file);
8113	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8114	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8115	struct btrfs_ordered_extent *ordered;
8116	struct extent_state *cached_state = NULL;
8117	struct extent_changeset *data_reserved = NULL;
8118	unsigned long zero_start;
8119	loff_t size;
8120	vm_fault_t ret;
8121	int ret2;
8122	int reserved = 0;
8123	u64 reserved_space;
8124	u64 page_start;
8125	u64 page_end;
8126	u64 end;
8127
8128	reserved_space = PAGE_SIZE;
8129
8130	sb_start_pagefault(inode->i_sb);
8131	page_start = page_offset(page);
8132	page_end = page_start + PAGE_SIZE - 1;
8133	end = page_end;
8134
8135	/*
8136	 * Reserving delalloc space after obtaining the page lock can lead to
8137	 * deadlock. For example, if a dirty page is locked by this function
8138	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8139	 * dirty page write out, then the btrfs_writepages() function could
8140	 * end up waiting indefinitely to get a lock on the page currently
8141	 * being processed by btrfs_page_mkwrite() function.
8142	 */
8143	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8144					    page_start, reserved_space);
8145	if (!ret2) {
8146		ret2 = file_update_time(vmf->vma->vm_file);
8147		reserved = 1;
8148	}
8149	if (ret2) {
8150		ret = vmf_error(ret2);
8151		if (reserved)
8152			goto out;
8153		goto out_noreserve;
8154	}
8155
8156	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8157again:
8158	down_read(&BTRFS_I(inode)->i_mmap_lock);
8159	lock_page(page);
8160	size = i_size_read(inode);
8161
8162	if ((page->mapping != inode->i_mapping) ||
8163	    (page_start >= size)) {
8164		/* page got truncated out from underneath us */
8165		goto out_unlock;
8166	}
8167	wait_on_page_writeback(page);
8168
8169	lock_extent(io_tree, page_start, page_end, &cached_state);
8170	ret2 = set_page_extent_mapped(page);
8171	if (ret2 < 0) {
8172		ret = vmf_error(ret2);
8173		unlock_extent(io_tree, page_start, page_end, &cached_state);
8174		goto out_unlock;
8175	}
8176
8177	/*
8178	 * we can't set the delalloc bits if there are pending ordered
8179	 * extents.  Drop our locks and wait for them to finish
8180	 */
8181	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8182			PAGE_SIZE);
8183	if (ordered) {
8184		unlock_extent(io_tree, page_start, page_end, &cached_state);
8185		unlock_page(page);
8186		up_read(&BTRFS_I(inode)->i_mmap_lock);
8187		btrfs_start_ordered_extent(ordered);
8188		btrfs_put_ordered_extent(ordered);
8189		goto again;
8190	}
8191
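	/*
	 * If this is the last page of the file, only the part up to EOF needs
	 * a delalloc reservation, so give back the space reserved for the part
	 * of the page beyond i_size.
	 */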
8192	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8193		reserved_space = round_up(size - page_start,
8194					  fs_info->sectorsize);
8195		if (reserved_space < PAGE_SIZE) {
8196			end = page_start + reserved_space - 1;
8197			btrfs_delalloc_release_space(BTRFS_I(inode),
8198					data_reserved, page_start,
8199					PAGE_SIZE - reserved_space, true);
8200		}
8201	}
8202
8203	/*
8204	 * page_mkwrite gets called when the page is first dirtied after it's
8205	 * faulted in, but write(2) could also dirty a page and set delalloc
8206	 * bits, thus in this case, for space accounting reasons, we still need to
8207	 * clear any delalloc bits within this page range since we have to
8208	 * reserve data & metadata space before lock_page() (see above comments).
8209	 */
8210	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8211			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8212			  EXTENT_DEFRAG, &cached_state);
8213
8214	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8215					&cached_state);
8216	if (ret2) {
8217		unlock_extent(io_tree, page_start, page_end, &cached_state);
8218		ret = VM_FAULT_SIGBUS;
8219		goto out_unlock;
8220	}
8221
8222	/* page is wholly or partially inside EOF */
8223	if (page_start + PAGE_SIZE > size)
8224		zero_start = offset_in_page(size);
8225	else
8226		zero_start = PAGE_SIZE;
8227
8228	if (zero_start != PAGE_SIZE)
8229		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8230
8231	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8232	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8233	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8234
8235	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8236
8237	unlock_extent(io_tree, page_start, page_end, &cached_state);
8238	up_read(&BTRFS_I(inode)->i_mmap_lock);
8239
8240	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8241	sb_end_pagefault(inode->i_sb);
8242	extent_changeset_free(data_reserved);
8243	return VM_FAULT_LOCKED;
8244
8245out_unlock:
8246	unlock_page(page);
8247	up_read(&BTRFS_I(inode)->i_mmap_lock);
8248out:
8249	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8250	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8251				     reserved_space, (ret != 0));
8252out_noreserve:
8253	sb_end_pagefault(inode->i_sb);
8254	extent_changeset_free(data_reserved);
8255	return ret;
8256}
8257
8258static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8259{
8260	struct btrfs_truncate_control control = {
8261		.inode = inode,
8262		.ino = btrfs_ino(inode),
8263		.min_type = BTRFS_EXTENT_DATA_KEY,
8264		.clear_extent_range = true,
8265	};
8266	struct btrfs_root *root = inode->root;
8267	struct btrfs_fs_info *fs_info = root->fs_info;
8268	struct btrfs_block_rsv *rsv;
8269	int ret;
8270	struct btrfs_trans_handle *trans;
8271	u64 mask = fs_info->sectorsize - 1;
8272	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8273
8274	if (!skip_writeback) {
8275		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
8276					       inode->vfs_inode.i_size & (~mask),
8277					       (u64)-1);
8278		if (ret)
8279			return ret;
8280	}
8281
8282	/*
8283	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8284	 * things going on here:
8285	 *
8286	 * 1) We need to reserve space to update our inode.
8287	 *
8288	 * 2) We need to have something to cache all the space that is going to
8289	 * be free'd up by the truncate operation, but also have some slack
8290	 * be freed up by the truncate operation, but also have some slack
8291	 * very much snapshotting).
8292	 *
8293	 * And we need these to be separate.  The fact is we can use a lot of
8294	 * space doing the truncate, and we have no earthly idea how much space
8295	 * we will use, so we need the truncate reservation to be separate so it
8296	 * doesn't end up using space reserved for updating the inode.  We also
8297	 * need to be able to stop the transaction and start a new one, which
8298	 * means we need to be able to update the inode several times, and we
8299	 * have no way of knowing how many times that will be, so we can't just
8300	 * reserve 1 item for the entirety of the operation, so that has to be
8301	 * done separately as well.
8302	 *
8303	 * So that leaves us with
8304	 *
8305	 * 1) rsv - for the truncate reservation, which we will steal from the
8306	 * transaction reservation.
8307	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
8308	 * updating the inode.
8309	 */
8310	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8311	if (!rsv)
8312		return -ENOMEM;
8313	rsv->size = min_size;
8314	rsv->failfast = true;
8315
8316	/*
8317	 * 1 for the truncate slack space
8318	 * 1 for updating the inode.
8319	 */
8320	trans = btrfs_start_transaction(root, 2);
8321	if (IS_ERR(trans)) {
8322		ret = PTR_ERR(trans);
8323		goto out;
8324	}
8325
8326	/* Migrate the slack space for the truncate to our reserve */
8327	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8328				      min_size, false);
8329	/*
8330	 * We have reserved 2 metadata units when we started the transaction and
8331	 * min_size matches 1 unit, so this should never fail, but if it does,
8332	 * it's not critical, we just fail truncation.
8333	 */
8334	if (WARN_ON(ret)) {
8335		btrfs_end_transaction(trans);
8336		goto out;
8337	}
8338
8339	trans->block_rsv = rsv;
8340
8341	while (1) {
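	/*
	 * Truncate the inode items in multiple passes: whenever
	 * btrfs_truncate_inode_items() returns -ENOSPC or -EAGAIN, update the
	 * inode, end the transaction and start a new one with a refilled
	 * reservation before continuing.
	 */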
8342		struct extent_state *cached_state = NULL;
8343		const u64 new_size = inode->vfs_inode.i_size;
8344		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8345
8346		control.new_size = new_size;
8347		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8348		/*
8349		 * We want to drop from the next block forward in case this new
8350		 * size is not block aligned since we will be keeping the last
8351		 * block of the extent just the way it is.
8352		 */
8353		btrfs_drop_extent_map_range(inode,
8354					    ALIGN(new_size, fs_info->sectorsize),
8355					    (u64)-1, false);
8356
8357		ret = btrfs_truncate_inode_items(trans, root, &control);
8358
8359		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
8360		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
8361
8362		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8363
8364		trans->block_rsv = &fs_info->trans_block_rsv;
8365		if (ret != -ENOSPC && ret != -EAGAIN)
8366			break;
8367
8368		ret = btrfs_update_inode(trans, root, inode);
8369		if (ret)
8370			break;
8371
8372		btrfs_end_transaction(trans);
8373		btrfs_btree_balance_dirty(fs_info);
8374
8375		trans = btrfs_start_transaction(root, 2);
8376		if (IS_ERR(trans)) {
8377			ret = PTR_ERR(trans);
8378			trans = NULL;
8379			break;
8380		}
8381
8382		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8383		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8384					      rsv, min_size, false);
8385		/*
8386		 * We have reserved 2 metadata units when we started the
8387		 * transaction and min_size matches 1 unit, so this should never
8388		 * fail, but if it does, it's not critical, we just fail truncation.
8389		 */
8390		if (WARN_ON(ret))
8391			break;
8392
8393		trans->block_rsv = rsv;
8394	}
8395
8396	/*
8397	 * We can't call btrfs_truncate_block inside a trans handle as we could
8398	 * deadlock with freeze.  If we got BTRFS_NEED_TRUNCATE_BLOCK then we
8399	 * know we've truncated everything except the last little bit, and can
8400	 * do btrfs_truncate_block and then update the disk_i_size.
8401	 */
8402	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8403		btrfs_end_transaction(trans);
8404		btrfs_btree_balance_dirty(fs_info);
8405
8406		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
8407		if (ret)
8408			goto out;
8409		trans = btrfs_start_transaction(root, 1);
8410		if (IS_ERR(trans)) {
8411			ret = PTR_ERR(trans);
8412			goto out;
8413		}
8414		btrfs_inode_safe_disk_i_size_write(inode, 0);
8415	}
8416
8417	if (trans) {
8418		int ret2;
8419
8420		trans->block_rsv = &fs_info->trans_block_rsv;
8421		ret2 = btrfs_update_inode(trans, root, inode);
8422		if (ret2 && !ret)
8423			ret = ret2;
8424
8425		ret2 = btrfs_end_transaction(trans);
8426		if (ret2 && !ret)
8427			ret = ret2;
8428		btrfs_btree_balance_dirty(fs_info);
8429	}
8430out:
8431	btrfs_free_block_rsv(fs_info, rsv);
8432	/*
8433	 * So if we truncate and then write and fsync we normally would just
8434	 * write the extents that changed, which is a problem if we need to
8435	 * first truncate that entire inode.  So set this flag so we write out
8436	 * all of the extents in the inode to the sync log so we're completely
8437	 * safe.
8438	 *
8439	 * If no extents were dropped or trimmed we don't need to force the next
8440	 * fsync to truncate all the inode's items from the log and re-log them
8441	 * all. This means the truncate operation did not change the file size,
8442	 * or changed it to a smaller size but there was only an implicit hole
8443	 * between the old i_size and the new i_size, and there were no prealloc
8444	 * extents beyond i_size to drop.
8445	 */
8446	if (control.extents_found > 0)
8447		btrfs_set_inode_full_sync(inode);
8448
8449	return ret;
8450}
8451
8452struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
8453				     struct inode *dir)
8454{
8455	struct inode *inode;
8456
8457	inode = new_inode(dir->i_sb);
8458	if (inode) {
8459		/*
8460		 * Subvolumes don't inherit the sgid bit or the parent's gid if
8461		 * the parent's sgid bit is set. This is probably a bug.
8462		 */
8463		inode_init_owner(idmap, inode, NULL,
8464				 S_IFDIR | (~current_umask() & S_IRWXUGO));
8465		inode->i_op = &btrfs_dir_inode_operations;
8466		inode->i_fop = &btrfs_dir_file_operations;
8467	}
8468	return inode;
8469}
8470
8471struct inode *btrfs_alloc_inode(struct super_block *sb)
8472{
8473	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8474	struct btrfs_inode *ei;
8475	struct inode *inode;
8476
8477	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8478	if (!ei)
8479		return NULL;
8480
8481	ei->root = NULL;
8482	ei->generation = 0;
8483	ei->last_trans = 0;
8484	ei->last_sub_trans = 0;
8485	ei->logged_trans = 0;
8486	ei->delalloc_bytes = 0;
8487	ei->new_delalloc_bytes = 0;
8488	ei->defrag_bytes = 0;
8489	ei->disk_i_size = 0;
8490	ei->flags = 0;
8491	ei->ro_flags = 0;
8492	ei->csum_bytes = 0;
8493	ei->index_cnt = (u64)-1;
8494	ei->dir_index = 0;
8495	ei->last_unlink_trans = 0;
8496	ei->last_reflink_trans = 0;
8497	ei->last_log_commit = 0;
8498
8499	spin_lock_init(&ei->lock);
8500	ei->outstanding_extents = 0;
8501	if (sb->s_magic != BTRFS_TEST_MAGIC)
8502		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8503					      BTRFS_BLOCK_RSV_DELALLOC);
8504	ei->runtime_flags = 0;
8505	ei->prop_compress = BTRFS_COMPRESS_NONE;
8506	ei->defrag_compress = BTRFS_COMPRESS_NONE;
8507
8508	ei->delayed_node = NULL;
8509
8510	ei->i_otime.tv_sec = 0;
8511	ei->i_otime.tv_nsec = 0;
8512
8513	inode = &ei->vfs_inode;
8514	extent_map_tree_init(&ei->extent_tree);
8515	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
8516	ei->io_tree.inode = ei;
8517	extent_io_tree_init(fs_info, &ei->file_extent_tree,
8518			    IO_TREE_INODE_FILE_EXTENT);
8519	mutex_init(&ei->log_mutex);
8520	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8521	INIT_LIST_HEAD(&ei->delalloc_inodes);
8522	INIT_LIST_HEAD(&ei->delayed_iput);
8523	RB_CLEAR_NODE(&ei->rb_node);
8524	init_rwsem(&ei->i_mmap_lock);
8525
8526	return inode;
8527}
8528
8529#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8530void btrfs_test_destroy_inode(struct inode *inode)
8531{
8532	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
8533	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8534}
8535#endif
8536
8537void btrfs_free_inode(struct inode *inode)
8538{
8539	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8540}
8541
8542void btrfs_destroy_inode(struct inode *vfs_inode)
8543{
8544	struct btrfs_ordered_extent *ordered;
8545	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8546	struct btrfs_root *root = inode->root;
8547	bool freespace_inode;
8548
8549	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8550	WARN_ON(vfs_inode->i_data.nrpages);
8551	WARN_ON(inode->block_rsv.reserved);
8552	WARN_ON(inode->block_rsv.size);
8553	WARN_ON(inode->outstanding_extents);
8554	if (!S_ISDIR(vfs_inode->i_mode)) {
8555		WARN_ON(inode->delalloc_bytes);
8556		WARN_ON(inode->new_delalloc_bytes);
8557	}
8558	WARN_ON(inode->csum_bytes);
8559	WARN_ON(inode->defrag_bytes);
8560
8561	/*
8562	 * This can happen when we create an inode, but somebody else also
8563	 * created the same inode and we need to destroy the one we already
8564	 * created.
8565	 */
8566	if (!root)
8567		return;
8568
8569	/*
8570	 * If this is a free space inode do not take the ordered extents lockdep
8571	 * map.
8572	 */
8573	freespace_inode = btrfs_is_free_space_inode(inode);
8574
8575	while (1) {
8576		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8577		if (!ordered)
8578			break;
8579		else {
8580			btrfs_err(root->fs_info,
8581				  "found ordered extent %llu %llu on inode cleanup",
8582				  ordered->file_offset, ordered->num_bytes);
8583
8584			if (!freespace_inode)
8585				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
8586
8587			btrfs_remove_ordered_extent(inode, ordered);
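			/*
			 * Put once for the reference taken by the lookup above
			 * and once for the reference held for the tree, as this
			 * ordered extent will never go through the normal
			 * completion path.
			 */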
8588			btrfs_put_ordered_extent(ordered);
8589			btrfs_put_ordered_extent(ordered);
8590		}
8591	}
8592	btrfs_qgroup_check_reserved_leak(inode);
8593	inode_tree_del(inode);
8594	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
8595	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8596	btrfs_put_root(inode->root);
8597}
8598
8599int btrfs_drop_inode(struct inode *inode)
8600{
8601	struct btrfs_root *root = BTRFS_I(inode)->root;
8602
8603	if (root == NULL)
8604		return 1;
8605
8606	/* The snap/subvol tree is being deleted. */
8607	if (btrfs_root_refs(&root->root_item) == 0)
8608		return 1;
8609	else
8610		return generic_drop_inode(inode);
8611}
8612
8613static void init_once(void *foo)
8614{
8615	struct btrfs_inode *ei = foo;
8616
8617	inode_init_once(&ei->vfs_inode);
8618}
8619
8620void __cold btrfs_destroy_cachep(void)
8621{
8622	/*
8623	 * Make sure all delayed rcu free inodes are flushed before we
8624	 * destroy cache.
8625	 */
8626	rcu_barrier();
8627	bioset_exit(&btrfs_dio_bioset);
8628	kmem_cache_destroy(btrfs_inode_cachep);
8629}
8630
8631int __init btrfs_init_cachep(void)
8632{
8633	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8634			sizeof(struct btrfs_inode), 0,
8635			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8636			init_once);
8637	if (!btrfs_inode_cachep)
8638		goto fail;
8639
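	/*
	 * The dio bio_set front pads every bio with struct btrfs_dio_private,
	 * hence the offset of the embedded bio is used as the front_pad size.
	 */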
8640	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
8641			offsetof(struct btrfs_dio_private, bbio.bio),
8642			BIOSET_NEED_BVECS))
8643		goto fail;
8644
8645	return 0;
8646fail:
8647	btrfs_destroy_cachep();
8648	return -ENOMEM;
8649}
8650
8651static int btrfs_getattr(struct mnt_idmap *idmap,
8652			 const struct path *path, struct kstat *stat,
8653			 u32 request_mask, unsigned int flags)
8654{
8655	u64 delalloc_bytes;
8656	u64 inode_bytes;
8657	struct inode *inode = d_inode(path->dentry);
8658	u32 blocksize = inode->i_sb->s_blocksize;
8659	u32 bi_flags = BTRFS_I(inode)->flags;
8660	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8661
8662	stat->result_mask |= STATX_BTIME;
8663	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
8664	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
8665	if (bi_flags & BTRFS_INODE_APPEND)
8666		stat->attributes |= STATX_ATTR_APPEND;
8667	if (bi_flags & BTRFS_INODE_COMPRESS)
8668		stat->attributes |= STATX_ATTR_COMPRESSED;
8669	if (bi_flags & BTRFS_INODE_IMMUTABLE)
8670		stat->attributes |= STATX_ATTR_IMMUTABLE;
8671	if (bi_flags & BTRFS_INODE_NODUMP)
8672		stat->attributes |= STATX_ATTR_NODUMP;
8673	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8674		stat->attributes |= STATX_ATTR_VERITY;
8675
8676	stat->attributes_mask |= (STATX_ATTR_APPEND |
8677				  STATX_ATTR_COMPRESSED |
8678				  STATX_ATTR_IMMUTABLE |
8679				  STATX_ATTR_NODUMP);
8680
8681	generic_fillattr(idmap, request_mask, inode, stat);
8682	stat->dev = BTRFS_I(inode)->root->anon_dev;
8683
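	/*
	 * new_delalloc_bytes tracks delalloc over ranges with no allocated
	 * extent yet, not reflected by inode_get_bytes(), so add it so that
	 * st_blocks accounts for the space pending writeback will consume.
	 */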
8684	spin_lock(&BTRFS_I(inode)->lock);
8685	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8686	inode_bytes = inode_get_bytes(inode);
8687	spin_unlock(&BTRFS_I(inode)->lock);
8688	stat->blocks = (ALIGN(inode_bytes, blocksize) +
8689			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8690	return 0;
8691}
8692
8693static int btrfs_rename_exchange(struct inode *old_dir,
8694			      struct dentry *old_dentry,
8695			      struct inode *new_dir,
8696			      struct dentry *new_dentry)
8697{
8698	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8699	struct btrfs_trans_handle *trans;
8700	unsigned int trans_num_items;
8701	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8702	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8703	struct inode *new_inode = new_dentry->d_inode;
8704	struct inode *old_inode = old_dentry->d_inode;
8705	struct btrfs_rename_ctx old_rename_ctx;
8706	struct btrfs_rename_ctx new_rename_ctx;
8707	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8708	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8709	u64 old_idx = 0;
8710	u64 new_idx = 0;
8711	int ret;
8712	int ret2;
8713	bool need_abort = false;
8714	struct fscrypt_name old_fname, new_fname;
8715	struct fscrypt_str *old_name, *new_name;
8716
8717	/*
8718	 * For non-subvolumes allow exchange only within one subvolume, in the
8719	 * same inode namespace. Two subvolumes (represented as directories) can
8720	 * be exchanged as they're a logical link and have a fixed inode number.
8721	 */
8722	if (root != dest &&
8723	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8724	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
8725		return -EXDEV;
8726
8727	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8728	if (ret)
8729		return ret;
8730
8731	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8732	if (ret) {
8733		fscrypt_free_filename(&old_fname);
8734		return ret;
8735	}
8736
8737	old_name = &old_fname.disk_name;
8738	new_name = &new_fname.disk_name;
8739
8740	/* close the race window with snapshot create/destroy ioctl */
8741	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8742	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
8743		down_read(&fs_info->subvol_sem);
8744
8745	/*
8746	 * For each inode:
8747	 * 1 to remove old dir item
8748	 * 1 to remove old dir index
8749	 * 1 to add new dir item
8750	 * 1 to add new dir index
8751	 * 1 to update parent inode
8752	 *
8753	 * If the parents are the same, we only need to account for one
8754	 */
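	/* 2 inodes * 5 items each = 10, or 9 when both parents are the same directory. */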
8755	trans_num_items = (old_dir == new_dir ? 9 : 10);
8756	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8757		/*
8758		 * 1 to remove old root ref
8759		 * 1 to remove old root backref
8760		 * 1 to add new root ref
8761		 * 1 to add new root backref
8762		 */
8763		trans_num_items += 4;
8764	} else {
8765		/*
8766		 * 1 to update inode item
8767		 * 1 to remove old inode ref
8768		 * 1 to add new inode ref
8769		 */
8770		trans_num_items += 3;
8771	}
8772	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8773		trans_num_items += 4;
8774	else
8775		trans_num_items += 3;
8776	trans = btrfs_start_transaction(root, trans_num_items);
8777	if (IS_ERR(trans)) {
8778		ret = PTR_ERR(trans);
8779		goto out_notrans;
8780	}
8781
8782	if (dest != root) {
8783		ret = btrfs_record_root_in_trans(trans, dest);
8784		if (ret)
8785			goto out_fail;
8786	}
8787
8788	/*
8789	 * We need to find a free sequence number both in the source and
8790	 * in the destination directory for the exchange.
8791	 */
8792	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8793	if (ret)
8794		goto out_fail;
8795	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8796	if (ret)
8797		goto out_fail;
8798
8799	BTRFS_I(old_inode)->dir_index = 0ULL;
8800	BTRFS_I(new_inode)->dir_index = 0ULL;
8801
8802	/* Reference for the source. */
8803	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8804		/* force full log commit if subvolume involved. */
8805		btrfs_set_log_full_commit(trans);
8806	} else {
8807		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8808					     btrfs_ino(BTRFS_I(new_dir)),
8809					     old_idx);
8810		if (ret)
8811			goto out_fail;
8812		need_abort = true;
8813	}
8814
8815	/* And now for the dest. */
8816	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8817		/* force full log commit if subvolume involved. */
8818		btrfs_set_log_full_commit(trans);
8819	} else {
8820		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8821					     btrfs_ino(BTRFS_I(old_dir)),
8822					     new_idx);
8823		if (ret) {
8824			if (need_abort)
8825				btrfs_abort_transaction(trans, ret);
8826			goto out_fail;
8827		}
8828	}
8829
8830	/* Update inode version and ctime/mtime. */
8831	inode_inc_iversion(old_dir);
8832	inode_inc_iversion(new_dir);
8833	inode_inc_iversion(old_inode);
8834	inode_inc_iversion(new_inode);
8835	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8836
8837	if (old_dentry->d_parent != new_dentry->d_parent) {
8838		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8839					BTRFS_I(old_inode), true);
8840		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8841					BTRFS_I(new_inode), true);
8842	}
8843
8844	/* src is a subvolume */
8845	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8846		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8847	} else { /* src is an inode */
8848		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8849					   BTRFS_I(old_dentry->d_inode),
8850					   old_name, &old_rename_ctx);
8851		if (!ret)
8852			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
8853	}
8854	if (ret) {
8855		btrfs_abort_transaction(trans, ret);
8856		goto out_fail;
8857	}
8858
8859	/* dest is a subvolume */
8860	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8861		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8862	} else { /* dest is an inode */
8863		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8864					   BTRFS_I(new_dentry->d_inode),
8865					   new_name, &new_rename_ctx);
8866		if (!ret)
8867			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
8868	}
8869	if (ret) {
8870		btrfs_abort_transaction(trans, ret);
8871		goto out_fail;
8872	}
8873
8874	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8875			     new_name, 0, old_idx);
8876	if (ret) {
8877		btrfs_abort_transaction(trans, ret);
8878		goto out_fail;
8879	}
8880
8881	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8882			     old_name, 0, new_idx);
8883	if (ret) {
8884		btrfs_abort_transaction(trans, ret);
8885		goto out_fail;
8886	}
8887
8888	if (old_inode->i_nlink == 1)
8889		BTRFS_I(old_inode)->dir_index = old_idx;
8890	if (new_inode->i_nlink == 1)
8891		BTRFS_I(new_inode)->dir_index = new_idx;
8892
8893	/*
8894	 * Now pin the logs of the roots. We do it to ensure that no other task
8895	 * can sync the logs while we are in progress with the rename, because
8896	 * that could result in an inconsistency in case any of the inodes that
8897	 * are part of this rename operation were logged before.
8898	 */
8899	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8900		btrfs_pin_log_trans(root);
8901	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8902		btrfs_pin_log_trans(dest);
8903
8904	/* Do the log updates for all inodes. */
8905	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8906		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8907				   old_rename_ctx.index, new_dentry->d_parent);
8908	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8909		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8910				   new_rename_ctx.index, old_dentry->d_parent);
8911
8912	/* Now unpin the logs. */
8913	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8914		btrfs_end_log_trans(root);
8915	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8916		btrfs_end_log_trans(dest);
8917out_fail:
8918	ret2 = btrfs_end_transaction(trans);
8919	ret = ret ? ret : ret2;
8920out_notrans:
8921	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8922	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
8923		up_read(&fs_info->subvol_sem);
8924
8925	fscrypt_free_filename(&new_fname);
8926	fscrypt_free_filename(&old_fname);
8927	return ret;
8928}
8929
8930static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8931					struct inode *dir)
8932{
8933	struct inode *inode;
8934
8935	inode = new_inode(dir->i_sb);
8936	if (inode) {
8937		inode_init_owner(idmap, inode, dir,
8938				 S_IFCHR | WHITEOUT_MODE);
8939		inode->i_op = &btrfs_special_inode_operations;
8940		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8941	}
8942	return inode;
8943}
8944
8945static int btrfs_rename(struct mnt_idmap *idmap,
8946			struct inode *old_dir, struct dentry *old_dentry,
8947			struct inode *new_dir, struct dentry *new_dentry,
8948			unsigned int flags)
8949{
8950	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8951	struct btrfs_new_inode_args whiteout_args = {
8952		.dir = old_dir,
8953		.dentry = old_dentry,
8954	};
8955	struct btrfs_trans_handle *trans;
8956	unsigned int trans_num_items;
8957	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8958	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8959	struct inode *new_inode = d_inode(new_dentry);
8960	struct inode *old_inode = d_inode(old_dentry);
8961	struct btrfs_rename_ctx rename_ctx;
8962	u64 index = 0;
8963	int ret;
8964	int ret2;
8965	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8966	struct fscrypt_name old_fname, new_fname;
8967
8968	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8969		return -EPERM;
8970
8971	/* we only allow rename subvolume link between subvolumes */
8972	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8973		return -EXDEV;
8974
8975	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8976	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8977		return -ENOTEMPTY;
8978
8979	if (S_ISDIR(old_inode->i_mode) && new_inode &&
8980	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8981		return -ENOTEMPTY;
8982
8983	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8984	if (ret)
8985		return ret;
8986
8987	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8988	if (ret) {
8989		fscrypt_free_filename(&old_fname);
8990		return ret;
8991	}
8992
8993	/* Check for collisions, even if the name isn't there. */
8994	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8995	if (ret) {
8996		if (ret == -EEXIST) {
8997			/*
8998			 * We shouldn't get -EEXIST without a new_inode.
8999			 */
9000			if (WARN_ON(!new_inode))
9001				goto out_fscrypt_names;
9002		} else {
9003			/* maybe -EOVERFLOW */
9004			goto out_fscrypt_names;
9005		}
9006	}
9007	ret = 0;
9008
9009	/*
9010	 * We're using rename to replace one file with another.  Start IO on it
9011	 * now so we don't add too much work to the end of the transaction.
9012	 */
9013	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9014		filemap_flush(old_inode->i_mapping);
9015
9016	if (flags & RENAME_WHITEOUT) {
9017		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
9018		if (!whiteout_args.inode) {
9019			ret = -ENOMEM;
9020			goto out_fscrypt_names;
9021		}
9022		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
9023		if (ret)
9024			goto out_whiteout_inode;
9025	} else {
9026		/* 1 to update the old parent inode. */
9027		trans_num_items = 1;
9028	}
9029
9030	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9031		/* Close the race window with snapshot create/destroy ioctl */
9032		down_read(&fs_info->subvol_sem);
9033		/*
9034		 * 1 to remove old root ref
9035		 * 1 to remove old root backref
9036		 * 1 to add new root ref
9037		 * 1 to add new root backref
9038		 */
9039		trans_num_items += 4;
9040	} else {
9041		/*
9042		 * 1 to update inode
9043		 * 1 to remove old inode ref
9044		 * 1 to add new inode ref
9045		 */
9046		trans_num_items += 3;
9047	}
9048	/*
9049	 * 1 to remove old dir item
9050	 * 1 to remove old dir index
9051	 * 1 to add new dir item
9052	 * 1 to add new dir index
9053	 */
9054	trans_num_items += 4;
9055	/* 1 to update new parent inode if it's not the same as the old parent */
9056	if (new_dir != old_dir)
9057		trans_num_items++;
9058	if (new_inode) {
9059		/*
9060		 * 1 to update inode
9061		 * 1 to remove inode ref
9062		 * 1 to remove dir item
9063		 * 1 to remove dir index
9064		 * 1 to possibly add orphan item
9065		 */
9066		trans_num_items += 5;
9067	}
9068	trans = btrfs_start_transaction(root, trans_num_items);
9069	if (IS_ERR(trans)) {
9070		ret = PTR_ERR(trans);
9071		goto out_notrans;
9072	}
9073
9074	if (dest != root) {
9075		ret = btrfs_record_root_in_trans(trans, dest);
9076		if (ret)
9077			goto out_fail;
9078	}
9079
9080	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9081	if (ret)
9082		goto out_fail;
9083
9084	BTRFS_I(old_inode)->dir_index = 0ULL;
9085	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9086		/* force full log commit if subvolume involved. */
9087		btrfs_set_log_full_commit(trans);
9088	} else {
9089		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
9090					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
9091					     index);
9092		if (ret)
9093			goto out_fail;
9094	}
9095
9096	inode_inc_iversion(old_dir);
9097	inode_inc_iversion(new_dir);
9098	inode_inc_iversion(old_inode);
9099	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
9100
9101	if (old_dentry->d_parent != new_dentry->d_parent)
9102		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9103					BTRFS_I(old_inode), true);
9104
9105	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9106		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
9107	} else {
9108		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9109					   BTRFS_I(d_inode(old_dentry)),
9110					   &old_fname.disk_name, &rename_ctx);
9111		if (!ret)
9112			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9113	}
9114	if (ret) {
9115		btrfs_abort_transaction(trans, ret);
9116		goto out_fail;
9117	}
9118
9119	if (new_inode) {
9120		inode_inc_iversion(new_inode);
9121		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9122			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9123			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
9124			BUG_ON(new_inode->i_nlink == 0);
9125		} else {
9126			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9127						 BTRFS_I(d_inode(new_dentry)),
9128						 &new_fname.disk_name);
9129		}
9130		if (!ret && new_inode->i_nlink == 0)
9131			ret = btrfs_orphan_add(trans,
9132					BTRFS_I(d_inode(new_dentry)));
9133		if (ret) {
9134			btrfs_abort_transaction(trans, ret);
9135			goto out_fail;
9136		}
9137	}
9138
9139	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9140			     &new_fname.disk_name, 0, index);
9141	if (ret) {
9142		btrfs_abort_transaction(trans, ret);
9143		goto out_fail;
9144	}
9145
9146	if (old_inode->i_nlink == 1)
9147		BTRFS_I(old_inode)->dir_index = index;
9148
9149	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9150		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9151				   rename_ctx.index, new_dentry->d_parent);
9152
9153	if (flags & RENAME_WHITEOUT) {
9154		ret = btrfs_create_new_inode(trans, &whiteout_args);
9155		if (ret) {
9156			btrfs_abort_transaction(trans, ret);
9157			goto out_fail;
9158		} else {
9159			unlock_new_inode(whiteout_args.inode);
9160			iput(whiteout_args.inode);
9161			whiteout_args.inode = NULL;
9162		}
9163	}
9164out_fail:
9165	ret2 = btrfs_end_transaction(trans);
9166	ret = ret ? ret : ret2;
9167out_notrans:
9168	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9169		up_read(&fs_info->subvol_sem);
9170	if (flags & RENAME_WHITEOUT)
9171		btrfs_new_inode_args_destroy(&whiteout_args);
9172out_whiteout_inode:
9173	if (flags & RENAME_WHITEOUT)
9174		iput(whiteout_args.inode);
9175out_fscrypt_names:
9176	fscrypt_free_filename(&old_fname);
9177	fscrypt_free_filename(&new_fname);
9178	return ret;
9179}
9180
9181static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
9182			 struct dentry *old_dentry, struct inode *new_dir,
9183			 struct dentry *new_dentry, unsigned int flags)
9184{
9185	int ret;
9186
9187	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9188		return -EINVAL;
9189
9190	if (flags & RENAME_EXCHANGE)
9191		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9192					    new_dentry);
9193	else
9194		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
9195				   new_dentry, flags);
9196
9197	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
9198
9199	return ret;
9200}
9201
9202struct btrfs_delalloc_work {
9203	struct inode *inode;
9204	struct completion completion;
9205	struct list_head list;
9206	struct btrfs_work work;
9207};
9208
9209static void btrfs_run_delalloc_work(struct btrfs_work *work)
9210{
9211	struct btrfs_delalloc_work *delalloc_work;
9212	struct inode *inode;
9213
9214	delalloc_work = container_of(work, struct btrfs_delalloc_work,
9215				     work);
9216	inode = delalloc_work->inode;
9217	filemap_flush(inode->i_mapping);
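	/*
	 * If async (compressed) extents were kicked off, the first flush may
	 * not have submitted everything, so flush once more to be sure.
	 */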
9218	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9219				&BTRFS_I(inode)->runtime_flags))
9220		filemap_flush(inode->i_mapping);
9221
9222	iput(inode);
9223	complete(&delalloc_work->completion);
9224}
9225
9226static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9227{
9228	struct btrfs_delalloc_work *work;
9229
9230	work = kmalloc(sizeof(*work), GFP_NOFS);
9231	if (!work)
9232		return NULL;
9233
9234	init_completion(&work->completion);
9235	INIT_LIST_HEAD(&work->list);
9236	work->inode = inode;
9237	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9238
9239	return work;
9240}
9241
9242/*
9243 * Some fairly slow code that needs optimization. This walks the list
9244 * of all the inodes with pending delalloc and forces them to disk.
9245 */
9246static int start_delalloc_inodes(struct btrfs_root *root,
9247				 struct writeback_control *wbc, bool snapshot,
9248				 bool in_reclaim_context)
9249{
9250	struct btrfs_inode *binode;
9251	struct inode *inode;
9252	struct btrfs_delalloc_work *work, *next;
9253	LIST_HEAD(works);
9254	LIST_HEAD(splice);
9255	int ret = 0;
9256	bool full_flush = wbc->nr_to_write == LONG_MAX;
9257
9258	mutex_lock(&root->delalloc_mutex);
9259	spin_lock(&root->delalloc_lock);
9260	list_splice_init(&root->delalloc_inodes, &splice);
9261	while (!list_empty(&splice)) {
9262		binode = list_entry(splice.next, struct btrfs_inode,
9263				    delalloc_inodes);
9264
9265		list_move_tail(&binode->delalloc_inodes,
9266			       &root->delalloc_inodes);
9267
9268		if (in_reclaim_context &&
9269		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9270			continue;
9271
9272		inode = igrab(&binode->vfs_inode);
9273		if (!inode) {
9274			cond_resched_lock(&root->delalloc_lock);
9275			continue;
9276		}
9277		spin_unlock(&root->delalloc_lock);
9278
9279		if (snapshot)
9280			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9281				&binode->runtime_flags);
9282		if (full_flush) {
9283			work = btrfs_alloc_delalloc_work(inode);
9284			if (!work) {
9285				iput(inode);
9286				ret = -ENOMEM;
9287				goto out;
9288			}
9289			list_add_tail(&work->list, &works);
9290			btrfs_queue_work(root->fs_info->flush_workers,
9291					 &work->work);
9292		} else {
9293			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9294			btrfs_add_delayed_iput(BTRFS_I(inode));
9295			if (ret || wbc->nr_to_write <= 0)
9296				goto out;
9297		}
9298		cond_resched();
9299		spin_lock(&root->delalloc_lock);
9300	}
9301	spin_unlock(&root->delalloc_lock);
9302
9303out:
9304	list_for_each_entry_safe(work, next, &works, list) {
9305		list_del_init(&work->list);
9306		wait_for_completion(&work->completion);
9307		kfree(work);
9308	}
9309
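	/*
	 * Put back any inodes we did not get to, so they remain on the root's
	 * delalloc list for a future flush.
	 */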
9310	if (!list_empty(&splice)) {
9311		spin_lock(&root->delalloc_lock);
9312		list_splice_tail(&splice, &root->delalloc_inodes);
9313		spin_unlock(&root->delalloc_lock);
9314	}
9315	mutex_unlock(&root->delalloc_mutex);
9316	return ret;
9317}
9318
9319int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9320{
9321	struct writeback_control wbc = {
9322		.nr_to_write = LONG_MAX,
9323		.sync_mode = WB_SYNC_NONE,
9324		.range_start = 0,
9325		.range_end = LLONG_MAX,
9326	};
9327	struct btrfs_fs_info *fs_info = root->fs_info;
9328
9329	if (BTRFS_FS_ERROR(fs_info))
9330		return -EROFS;
9331
9332	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9333}
9334
9335int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9336			       bool in_reclaim_context)
9337{
9338	struct writeback_control wbc = {
9339		.nr_to_write = nr,
9340		.sync_mode = WB_SYNC_NONE,
9341		.range_start = 0,
9342		.range_end = LLONG_MAX,
9343	};
9344	struct btrfs_root *root;
9345	LIST_HEAD(splice);
9346	int ret;
9347
9348	if (BTRFS_FS_ERROR(fs_info))
9349		return -EROFS;
9350
9351	mutex_lock(&fs_info->delalloc_root_mutex);
9352	spin_lock(&fs_info->delalloc_root_lock);
9353	list_splice_init(&fs_info->delalloc_roots, &splice);
9354	while (!list_empty(&splice)) {
9355		/*
9356		 * Reset nr_to_write here so we know that we're doing a full
9357		 * flush.
9358		 */
9359		if (nr == LONG_MAX)
9360			wbc.nr_to_write = LONG_MAX;
9361
9362		root = list_first_entry(&splice, struct btrfs_root,
9363					delalloc_root);
9364		root = btrfs_grab_root(root);
9365		BUG_ON(!root);
9366		list_move_tail(&root->delalloc_root,
9367			       &fs_info->delalloc_roots);
9368		spin_unlock(&fs_info->delalloc_root_lock);
9369
9370		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9371		btrfs_put_root(root);
9372		if (ret < 0 || wbc.nr_to_write <= 0)
9373			goto out;
9374		spin_lock(&fs_info->delalloc_root_lock);
9375	}
9376	spin_unlock(&fs_info->delalloc_root_lock);
9377
9378	ret = 0;
9379out:
9380	if (!list_empty(&splice)) {
9381		spin_lock(&fs_info->delalloc_root_lock);
9382		list_splice_tail(&splice, &fs_info->delalloc_roots);
9383		spin_unlock(&fs_info->delalloc_root_lock);
9384	}
9385	mutex_unlock(&fs_info->delalloc_root_mutex);
9386	return ret;
9387}
9388
9389static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
9390			 struct dentry *dentry, const char *symname)
9391{
9392	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9393	struct btrfs_trans_handle *trans;
9394	struct btrfs_root *root = BTRFS_I(dir)->root;
9395	struct btrfs_path *path;
9396	struct btrfs_key key;
9397	struct inode *inode;
9398	struct btrfs_new_inode_args new_inode_args = {
9399		.dir = dir,
9400		.dentry = dentry,
9401	};
9402	unsigned int trans_num_items;
9403	int err;
9404	int name_len;
9405	int datasize;
9406	unsigned long ptr;
9407	struct btrfs_file_extent_item *ei;
9408	struct extent_buffer *leaf;
9409
9410	name_len = strlen(symname);
9411	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9412		return -ENAMETOOLONG;
9413
9414	inode = new_inode(dir->i_sb);
9415	if (!inode)
9416		return -ENOMEM;
9417	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
9418	inode->i_op = &btrfs_symlink_inode_operations;
9419	inode_nohighmem(inode);
9420	inode->i_mapping->a_ops = &btrfs_aops;
9421	btrfs_i_size_write(BTRFS_I(inode), name_len);
9422	inode_set_bytes(inode, name_len);
9423
9424	new_inode_args.inode = inode;
9425	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9426	if (err)
9427		goto out_inode;
9428	/* 1 additional item for the inline extent */
9429	trans_num_items++;
9430
9431	trans = btrfs_start_transaction(root, trans_num_items);
9432	if (IS_ERR(trans)) {
9433		err = PTR_ERR(trans);
9434		goto out_new_inode_args;
9435	}
9436
9437	err = btrfs_create_new_inode(trans, &new_inode_args);
9438	if (err)
9439		goto out;
9440
9441	path = btrfs_alloc_path();
9442	if (!path) {
9443		err = -ENOMEM;
9444		btrfs_abort_transaction(trans, err);
9445		discard_new_inode(inode);
9446		inode = NULL;
9447		goto out;
9448	}
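	/*
	 * The symlink target is stored as the data of an inline file extent
	 * item, so build and insert that item now.
	 */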
9449	key.objectid = btrfs_ino(BTRFS_I(inode));
9450	key.offset = 0;
9451	key.type = BTRFS_EXTENT_DATA_KEY;
9452	datasize = btrfs_file_extent_calc_inline_size(name_len);
9453	err = btrfs_insert_empty_item(trans, root, path, &key,
9454				      datasize);
9455	if (err) {
9456		btrfs_abort_transaction(trans, err);
9457		btrfs_free_path(path);
9458		discard_new_inode(inode);
9459		inode = NULL;
9460		goto out;
9461	}
9462	leaf = path->nodes[0];
9463	ei = btrfs_item_ptr(leaf, path->slots[0],
9464			    struct btrfs_file_extent_item);
9465	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9466	btrfs_set_file_extent_type(leaf, ei,
9467				   BTRFS_FILE_EXTENT_INLINE);
9468	btrfs_set_file_extent_encryption(leaf, ei, 0);
9469	btrfs_set_file_extent_compression(leaf, ei, 0);
9470	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9471	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9472
9473	ptr = btrfs_file_extent_inline_start(ei);
9474	write_extent_buffer(leaf, symname, ptr, name_len);
9475	btrfs_mark_buffer_dirty(trans, leaf);
9476	btrfs_free_path(path);
9477
9478	d_instantiate_new(dentry, inode);
9479	err = 0;
9480out:
9481	btrfs_end_transaction(trans);
9482	btrfs_btree_balance_dirty(fs_info);
9483out_new_inode_args:
9484	btrfs_new_inode_args_destroy(&new_inode_args);
9485out_inode:
9486	if (err)
9487		iput(inode);
9488	return err;
9489}
9490
9491static struct btrfs_trans_handle *insert_prealloc_file_extent(
9492				       struct btrfs_trans_handle *trans_in,
9493				       struct btrfs_inode *inode,
9494				       struct btrfs_key *ins,
9495				       u64 file_offset)
9496{
9497	struct btrfs_file_extent_item stack_fi;
9498	struct btrfs_replace_extent_info extent_info;
9499	struct btrfs_trans_handle *trans = trans_in;
9500	struct btrfs_path *path;
9501	u64 start = ins->objectid;
9502	u64 len = ins->offset;
9503	u64 qgroup_released = 0;
9504	int ret;
9505
9506	memset(&stack_fi, 0, sizeof(stack_fi));
9507
9508	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9509	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9510	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9511	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9512	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9513	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9514	/* Encryption and other encoding is reserved and all 0 */
9515
9516	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
9517	if (ret < 0)
9518		return ERR_PTR(ret);
9519
9520	if (trans) {
9521		ret = insert_reserved_file_extent(trans, inode,
9522						  file_offset, &stack_fi,
9523						  true, qgroup_released);
9524		if (ret)
9525			goto free_qgroup;
9526		return trans;
9527	}
9528
9529	extent_info.disk_offset = start;
9530	extent_info.disk_len = len;
9531	extent_info.data_offset = 0;
9532	extent_info.data_len = len;
9533	extent_info.file_offset = file_offset;
9534	extent_info.extent_buf = (char *)&stack_fi;
9535	extent_info.is_new_extent = true;
9536	extent_info.update_times = true;
9537	extent_info.qgroup_reserved = qgroup_released;
9538	extent_info.insertions = 0;
9539
9540	path = btrfs_alloc_path();
9541	if (!path) {
9542		ret = -ENOMEM;
9543		goto free_qgroup;
9544	}
9545
9546	ret = btrfs_replace_file_extents(inode, path, file_offset,
9547				     file_offset + len - 1, &extent_info,
9548				     &trans);
9549	btrfs_free_path(path);
9550	if (ret)
9551		goto free_qgroup;
9552	return trans;
9553
9554free_qgroup:
9555	/*
9556	 * We have released qgroup data range at the beginning of the function,
9557	 * and normally qgroup_released bytes will be freed when committing
9558	 * transaction.
9559	 * But if we error out early, we have to free what we have released
9560	 * or we leak qgroup data reservation.
9561	 */
9562	btrfs_qgroup_free_refroot(inode->root->fs_info,
9563			inode->root->root_key.objectid, qgroup_released,
9564			BTRFS_QGROUP_RSV_DATA);
9565	return ERR_PTR(ret);
9566}
9567
9568static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9569				       u64 start, u64 num_bytes, u64 min_size,
9570				       loff_t actual_len, u64 *alloc_hint,
9571				       struct btrfs_trans_handle *trans)
9572{
9573	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9574	struct extent_map *em;
9575	struct btrfs_root *root = BTRFS_I(inode)->root;
9576	struct btrfs_key ins;
9577	u64 cur_offset = start;
9578	u64 clear_offset = start;
9579	u64 i_size;
9580	u64 cur_bytes;
9581	u64 last_alloc = (u64)-1;
9582	int ret = 0;
9583	bool own_trans = true;
9584	u64 end = start + num_bytes - 1;
9585
9586	if (trans)
9587		own_trans = false;
9588	while (num_bytes > 0) {
9589		cur_bytes = min_t(u64, num_bytes, SZ_256M);
9590		cur_bytes = max(cur_bytes, min_size);
9591		/*
9592		 * If we are severely fragmented we could end up with really
9593		 * small allocations, so if the allocator is returning small
9594		 * chunks let's make its job easier by only searching for those
9595		 * sized chunks.
9596		 */
9597		cur_bytes = min(cur_bytes, last_alloc);
9598		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9599				min_size, 0, *alloc_hint, &ins, 1, 0);
9600		if (ret)
9601			break;
9602
9603		/*
9604		 * We've reserved this space, and thus converted it from
9605		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9606		 * from here on out we will only need to clear our reservation
9607		 * for the remaining unreserved area, so advance our
9608		 * clear_offset by our extent size.
9609		 */
9610		clear_offset += ins.offset;
9611
9612		last_alloc = ins.offset;
9613		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9614						    &ins, cur_offset);
9615		/*
9616		 * Now that we inserted the prealloc extent we can finally
9617		 * decrement the number of reservations in the block group.
9618		 * If we did it before, we could race with relocation and have
9619		 * relocation miss the reserved extent, making it fail later.
9620		 */
9621		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9622		if (IS_ERR(trans)) {
9623			ret = PTR_ERR(trans);
9624			btrfs_free_reserved_extent(fs_info, ins.objectid,
9625						   ins.offset, 0);
9626			break;
9627		}
9628
9629		em = alloc_extent_map();
9630		if (!em) {
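			/*
			 * Without a cached extent map, drop the range and force
			 * a full fsync so the log does not depend on extent
			 * maps we failed to insert.
			 */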
9631			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9632					    cur_offset + ins.offset - 1, false);
9633			btrfs_set_inode_full_sync(BTRFS_I(inode));
9634			goto next;
9635		}
9636
9637		em->start = cur_offset;
9638		em->orig_start = cur_offset;
9639		em->len = ins.offset;
9640		em->block_start = ins.objectid;
9641		em->block_len = ins.offset;
9642		em->orig_block_len = ins.offset;
9643		em->ram_bytes = ins.offset;
9644		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9645		em->generation = trans->transid;
9646
9647		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9648		free_extent_map(em);
9649next:
9650		num_bytes -= ins.offset;
9651		cur_offset += ins.offset;
9652		*alloc_hint = ins.objectid + ins.offset;
9653
9654		inode_inc_iversion(inode);
9655		inode_set_ctime_current(inode);
9656		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9657		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9658		    (actual_len > inode->i_size) &&
9659		    (cur_offset > inode->i_size)) {
9660			if (cur_offset > actual_len)
9661				i_size = actual_len;
9662			else
9663				i_size = cur_offset;
9664			i_size_write(inode, i_size);
9665			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9666		}
9667
9668		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9669
9670		if (ret) {
9671			btrfs_abort_transaction(trans, ret);
9672			if (own_trans)
9673				btrfs_end_transaction(trans);
9674			break;
9675		}
9676
9677		if (own_trans) {
9678			btrfs_end_transaction(trans);
9679			trans = NULL;
9680		}
9681	}
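	/*
	 * Release the data space reservation for whatever part of the range
	 * was never allocated.
	 */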
9682	if (clear_offset < end)
9683		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9684			end - clear_offset + 1);
9685	return ret;
9686}
9687
9688int btrfs_prealloc_file_range(struct inode *inode, int mode,
9689			      u64 start, u64 num_bytes, u64 min_size,
9690			      loff_t actual_len, u64 *alloc_hint)
9691{
9692	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9693					   min_size, actual_len, alloc_hint,
9694					   NULL);
9695}
9696
9697int btrfs_prealloc_file_range_trans(struct inode *inode,
9698				    struct btrfs_trans_handle *trans, int mode,
9699				    u64 start, u64 num_bytes, u64 min_size,
9700				    loff_t actual_len, u64 *alloc_hint)
9701{
9702	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9703					   min_size, actual_len, alloc_hint, trans);
9704}
9705
9706static int btrfs_permission(struct mnt_idmap *idmap,
9707			    struct inode *inode, int mask)
9708{
9709	struct btrfs_root *root = BTRFS_I(inode)->root;
9710	umode_t mode = inode->i_mode;
9711
9712	if (mask & MAY_WRITE &&
9713	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9714		if (btrfs_root_readonly(root))
9715			return -EROFS;
9716		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9717			return -EACCES;
9718	}
9719	return generic_permission(idmap, inode, mask);
9720}
9721
9722static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9723			 struct file *file, umode_t mode)
9724{
9725	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9726	struct btrfs_trans_handle *trans;
9727	struct btrfs_root *root = BTRFS_I(dir)->root;
9728	struct inode *inode;
9729	struct btrfs_new_inode_args new_inode_args = {
9730		.dir = dir,
9731		.dentry = file->f_path.dentry,
9732		.orphan = true,
9733	};
9734	unsigned int trans_num_items;
9735	int ret;
9736
9737	inode = new_inode(dir->i_sb);
9738	if (!inode)
9739		return -ENOMEM;
9740	inode_init_owner(idmap, inode, dir, mode);
9741	inode->i_fop = &btrfs_file_operations;
9742	inode->i_op = &btrfs_file_inode_operations;
9743	inode->i_mapping->a_ops = &btrfs_aops;
9744
9745	new_inode_args.inode = inode;
9746	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9747	if (ret)
9748		goto out_inode;
9749
9750	trans = btrfs_start_transaction(root, trans_num_items);
9751	if (IS_ERR(trans)) {
9752		ret = PTR_ERR(trans);
9753		goto out_new_inode_args;
9754	}
9755
9756	ret = btrfs_create_new_inode(trans, &new_inode_args);
9757
9758	/*
9759	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9760	 * set it to 1 because d_tmpfile() will issue a warning if the count is
9761	 * 0, through:
9762	 *
9763	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9764	 */
9765	set_nlink(inode, 1);
9766
9767	if (!ret) {
9768		d_tmpfile(file, inode);
9769		unlock_new_inode(inode);
9770		mark_inode_dirty(inode);
9771	}
9772
9773	btrfs_end_transaction(trans);
9774	btrfs_btree_balance_dirty(fs_info);
9775out_new_inode_args:
9776	btrfs_new_inode_args_destroy(&new_inode_args);
9777out_inode:
9778	if (ret)
9779		iput(inode);
9780	return finish_open_simple(file, ret);
9781}
9782
9783void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
9784{
9785	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9786	unsigned long index = start >> PAGE_SHIFT;
9787	unsigned long end_index = end >> PAGE_SHIFT;
9788	struct page *page;
9789	u32 len;
9790
9791	ASSERT(end + 1 - start <= U32_MAX);
9792	len = end + 1 - start;
9793	while (index <= end_index) {
9794		page = find_get_page(inode->vfs_inode.i_mapping, index);
9795		ASSERT(page); /* Pages should be in the extent_io_tree */
9796
9797		btrfs_page_set_writeback(fs_info, page, start, len);
9798		put_page(page);
9799		index++;
9800	}
9801}
9802
9803int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9804					     int compress_type)
9805{
9806	switch (compress_type) {
9807	case BTRFS_COMPRESS_NONE:
9808		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9809	case BTRFS_COMPRESS_ZLIB:
9810		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9811	case BTRFS_COMPRESS_LZO:
9812		/*
9813		 * The LZO format depends on the sector size. 64K is the maximum
9814		 * sector size that we support.
9815		 */
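		/*
		 * E.g. 4K sectors (bits = 12) map to LZO_4K, 64K sectors
		 * (bits = 16) map to LZO_4K + 4 == LZO_64K.
		 */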
9816		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9817			return -EINVAL;
9818		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9819		       (fs_info->sectorsize_bits - 12);
9820	case BTRFS_COMPRESS_ZSTD:
9821		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9822	default:
9823		return -EUCLEAN;
9824	}
9825}
9826
9827static ssize_t btrfs_encoded_read_inline(
9828				struct kiocb *iocb,
9829				struct iov_iter *iter, u64 start,
9830				u64 lockend,
9831				struct extent_state **cached_state,
9832				u64 extent_start, size_t count,
9833				struct btrfs_ioctl_encoded_io_args *encoded,
9834				bool *unlocked)
9835{
9836	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9837	struct btrfs_root *root = inode->root;
9838	struct btrfs_fs_info *fs_info = root->fs_info;
9839	struct extent_io_tree *io_tree = &inode->io_tree;
9840	struct btrfs_path *path;
9841	struct extent_buffer *leaf;
9842	struct btrfs_file_extent_item *item;
9843	u64 ram_bytes;
9844	unsigned long ptr;
9845	void *tmp;
9846	ssize_t ret;
9847
9848	path = btrfs_alloc_path();
9849	if (!path) {
9850		ret = -ENOMEM;
9851		goto out;
9852	}
9853	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9854				       extent_start, 0);
9855	if (ret) {
9856		if (ret > 0) {
9857			/* The extent item disappeared? */
9858			ret = -EIO;
9859		}
9860		goto out;
9861	}
9862	leaf = path->nodes[0];
9863	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9864
9865	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9866	ptr = btrfs_file_extent_inline_start(item);
9867
9868	encoded->len = min_t(u64, extent_start + ram_bytes,
9869			     inode->vfs_inode.i_size) - iocb->ki_pos;
9870	ret = btrfs_encoded_io_compression_from_extent(fs_info,
9871				 btrfs_file_extent_compression(leaf, item));
9872	if (ret < 0)
9873		goto out;
9874	encoded->compression = ret;
9875	if (encoded->compression) {
9876		size_t inline_size;
9877
9878		inline_size = btrfs_file_extent_inline_item_len(leaf,
9879								path->slots[0]);
9880		if (inline_size > count) {
9881			ret = -ENOBUFS;
9882			goto out;
9883		}
9884		count = inline_size;
9885		encoded->unencoded_len = ram_bytes;
9886		encoded->unencoded_offset = iocb->ki_pos - extent_start;
9887	} else {
9888		count = min_t(u64, count, encoded->len);
9889		encoded->len = count;
9890		encoded->unencoded_len = count;
9891		ptr += iocb->ki_pos - extent_start;
9892	}
9893
9894	tmp = kmalloc(count, GFP_NOFS);
9895	if (!tmp) {
9896		ret = -ENOMEM;
9897		goto out;
9898	}
9899	read_extent_buffer(leaf, tmp, ptr, count);
9900	btrfs_release_path(path);
9901	unlock_extent(io_tree, start, lockend, cached_state);
9902	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9903	*unlocked = true;
9904
9905	ret = copy_to_iter(tmp, count, iter);
9906	if (ret != count)
9907		ret = -EFAULT;
9908	kfree(tmp);
9909out:
9910	btrfs_free_path(path);
9911	return ret;
9912}
9913
9914struct btrfs_encoded_read_private {
9915	wait_queue_head_t wait;
9916	atomic_t pending;
9917	blk_status_t status;
9918};
9919
9920static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9921{
9922	struct btrfs_encoded_read_private *priv = bbio->private;
9923
9924	if (bbio->bio.bi_status) {
9925		/*
9926		 * The memory barrier implied by the atomic_dec_return() here
9927		 * pairs with the memory barrier implied by the
9928		 * atomic_dec_return() or io_wait_event() in
9929		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9930		 * write is observed before the load of status in
9931		 * btrfs_encoded_read_regular_fill_pages().
9932		 */
9933		WRITE_ONCE(priv->status, bbio->bio.bi_status);
9934	}
9935	if (!atomic_dec_return(&priv->pending))
9936		wake_up(&priv->wait);
9937	bio_put(&bbio->bio);
9938}
9939
9940int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9941					  u64 file_offset, u64 disk_bytenr,
9942					  u64 disk_io_size, struct page **pages)
9943{
9944	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9945	struct btrfs_encoded_read_private priv = {
9946		.pending = ATOMIC_INIT(1),
9947	};
9948	unsigned long i = 0;
9949	struct btrfs_bio *bbio;
9950
9951	init_waitqueue_head(&priv.wait);
9952
9953	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9954			       btrfs_encoded_read_endio, &priv);
9955	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9956	bbio->inode = inode;
9957
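	/*
	 * Add pages until the bio is full, then submit it and start a new one.
	 * The page index only advances once a page was added successfully.
	 */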
9958	do {
9959		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9960
9961		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9962			atomic_inc(&priv.pending);
9963			btrfs_submit_bio(bbio, 0);
9964
9965			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9966					       btrfs_encoded_read_endio, &priv);
9967			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9968			bbio->inode = inode;
9969			continue;
9970		}
9971
9972		i++;
9973		disk_bytenr += bytes;
9974		disk_io_size -= bytes;
9975	} while (disk_io_size);
9976
9977	atomic_inc(&priv.pending);
9978	btrfs_submit_bio(bbio, 0);
9979
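	/* Drop the initial pending count; wait only if bios are still in flight. */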
9980	if (atomic_dec_return(&priv.pending))
9981		io_wait_event(priv.wait, !atomic_read(&priv.pending));
9982	/* See btrfs_encoded_read_endio() for ordering. */
9983	return blk_status_to_errno(READ_ONCE(priv.status));
9984}
9985
9986static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
9987					  struct iov_iter *iter,
9988					  u64 start, u64 lockend,
9989					  struct extent_state **cached_state,
9990					  u64 disk_bytenr, u64 disk_io_size,
9991					  size_t count, bool compressed,
9992					  bool *unlocked)
9993{
9994	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9995	struct extent_io_tree *io_tree = &inode->io_tree;
9996	struct page **pages;
9997	unsigned long nr_pages, i;
9998	u64 cur;
9999	size_t page_offset;
10000	ssize_t ret;
10001
10002	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10003	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10004	if (!pages)
10005		return -ENOMEM;
10006	ret = btrfs_alloc_page_array(nr_pages, pages);
10007	if (ret) {
10008		ret = -ENOMEM;
10009		goto out;
10010	}
10011
10012	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10013						    disk_io_size, pages);
10014	if (ret)
10015		goto out;
10016
10017	unlock_extent(io_tree, start, lockend, cached_state);
10018	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10019	*unlocked = true;
10020
10021	if (compressed) {
10022		i = 0;
10023		page_offset = 0;
10024	} else {
10025		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10026		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10027	}
10028	cur = 0;
10029	while (cur < count) {
10030		size_t bytes = min_t(size_t, count - cur,
10031				     PAGE_SIZE - page_offset);
10032
10033		if (copy_page_to_iter(pages[i], page_offset, bytes,
10034				      iter) != bytes) {
10035			ret = -EFAULT;
10036			goto out;
10037		}
10038		i++;
10039		cur += bytes;
10040		page_offset = 0;
10041	}
10042	ret = count;
10043out:
10044	for (i = 0; i < nr_pages; i++) {
10045		if (pages[i])
10046			__free_page(pages[i]);
10047	}
10048	kfree(pages);
10049	return ret;
10050}
10051
10052ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10053			   struct btrfs_ioctl_encoded_io_args *encoded)
10054{
10055	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10056	struct btrfs_fs_info *fs_info = inode->root->fs_info;
10057	struct extent_io_tree *io_tree = &inode->io_tree;
10058	ssize_t ret;
10059	size_t count = iov_iter_count(iter);
10060	u64 start, lockend, disk_bytenr, disk_io_size;
10061	struct extent_state *cached_state = NULL;
10062	struct extent_map *em;
10063	bool unlocked = false;
10064
10065	file_accessed(iocb->ki_filp);
10066
10067	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
10068
10069	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10070		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10071		return 0;
10072	}
10073	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10074	/*
10075	 * We don't know how long the extent containing iocb->ki_pos is, but if
10076	 * it's compressed we know that it won't be longer than this.
10077	 */
10078	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10079
10080	for (;;) {
10081		struct btrfs_ordered_extent *ordered;
10082
10083		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10084					       lockend - start + 1);
10085		if (ret)
10086			goto out_unlock_inode;
10087		lock_extent(io_tree, start, lockend, &cached_state);
10088		ordered = btrfs_lookup_ordered_range(inode, start,
10089						     lockend - start + 1);
10090		if (!ordered)
10091			break;
10092		btrfs_put_ordered_extent(ordered);
10093		unlock_extent(io_tree, start, lockend, &cached_state);
10094		cond_resched();
10095	}
10096
10097	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10098	if (IS_ERR(em)) {
10099		ret = PTR_ERR(em);
10100		goto out_unlock_extent;
10101	}
10102
10103	if (em->block_start == EXTENT_MAP_INLINE) {
10104		u64 extent_start = em->start;
10105
10106		/*
10107		 * For inline extents we get everything we need out of the
10108		 * extent item.
10109		 */
10110		free_extent_map(em);
10111		em = NULL;
10112		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10113						&cached_state, extent_start,
10114						count, encoded, &unlocked);
10115		goto out;
10116	}
10117
10118	/*
10119	 * We only want to return up to EOF even if the extent extends beyond
10120	 * that.
10121	 */
10122	encoded->len = min_t(u64, extent_map_end(em),
10123			     inode->vfs_inode.i_size) - iocb->ki_pos;
10124	if (em->block_start == EXTENT_MAP_HOLE ||
10125	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10126		disk_bytenr = EXTENT_MAP_HOLE;
10127		count = min_t(u64, count, encoded->len);
10128		encoded->len = count;
10129		encoded->unencoded_len = count;
10130	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10131		disk_bytenr = em->block_start;
10132		/*
10133		 * Bail if the buffer isn't large enough to return the whole
10134		 * compressed extent.
10135		 */
10136		if (em->block_len > count) {
10137			ret = -ENOBUFS;
10138			goto out_em;
10139		}
10140		disk_io_size = em->block_len;
10141		count = em->block_len;
10142		encoded->unencoded_len = em->ram_bytes;
10143		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10144		ret = btrfs_encoded_io_compression_from_extent(fs_info,
10145							     em->compress_type);
10146		if (ret < 0)
10147			goto out_em;
10148		encoded->compression = ret;
10149	} else {
10150		disk_bytenr = em->block_start + (start - em->start);
10151		if (encoded->len > count)
10152			encoded->len = count;
10153		/*
10154		 * Don't read beyond what we locked. This also limits the page
10155		 * allocations that we'll do.
10156		 */
10157		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10158		count = start + disk_io_size - iocb->ki_pos;
10159		encoded->len = count;
10160		encoded->unencoded_len = count;
10161		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10162	}
10163	free_extent_map(em);
10164	em = NULL;
10165
10166	if (disk_bytenr == EXTENT_MAP_HOLE) {
10167		unlock_extent(io_tree, start, lockend, &cached_state);
10168		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10169		unlocked = true;
10170		ret = iov_iter_zero(count, iter);
10171		if (ret != count)
10172			ret = -EFAULT;
10173	} else {
10174		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10175						 &cached_state, disk_bytenr,
10176						 disk_io_size, count,
10177						 encoded->compression,
10178						 &unlocked);
10179	}
10180
10181out:
10182	if (ret >= 0)
10183		iocb->ki_pos += encoded->len;
10184out_em:
10185	free_extent_map(em);
10186out_unlock_extent:
10187	if (!unlocked)
10188		unlock_extent(io_tree, start, lockend, &cached_state);
10189out_unlock_inode:
10190	if (!unlocked)
10191		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10192	return ret;
10193}
10194
10195ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10196			       const struct btrfs_ioctl_encoded_io_args *encoded)
10197{
10198	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10199	struct btrfs_root *root = inode->root;
10200	struct btrfs_fs_info *fs_info = root->fs_info;
10201	struct extent_io_tree *io_tree = &inode->io_tree;
10202	struct extent_changeset *data_reserved = NULL;
10203	struct extent_state *cached_state = NULL;
10204	struct btrfs_ordered_extent *ordered;
10205	int compression;
10206	size_t orig_count;
10207	u64 start, end;
10208	u64 num_bytes, ram_bytes, disk_num_bytes;
10209	unsigned long nr_pages, i;
10210	struct page **pages;
10211	struct btrfs_key ins;
10212	bool extent_reserved = false;
10213	struct extent_map *em;
10214	ssize_t ret;
10215
10216	switch (encoded->compression) {
10217	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10218		compression = BTRFS_COMPRESS_ZLIB;
10219		break;
10220	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10221		compression = BTRFS_COMPRESS_ZSTD;
10222		break;
10223	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10224	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10225	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10226	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10227	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10228		/* The sector size must match for LZO. */
10229		if (encoded->compression -
10230		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10231		    fs_info->sectorsize_bits)
10232			return -EINVAL;
10233		compression = BTRFS_COMPRESS_LZO;
10234		break;
10235	default:
10236		return -EINVAL;
10237	}
10238	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10239		return -EINVAL;
10240
10241	/*
10242	 * Compressed extents should always have checksums, so error out if we
10243	 * have a NOCOW file or inode was created while mounted with NODATASUM.
10244	 */
10245	if (inode->flags & BTRFS_INODE_NODATASUM)
10246		return -EINVAL;
10247
10248	orig_count = iov_iter_count(from);
10249
10250	/* The extent size must be sane. */
10251	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10252	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10253		return -EINVAL;
10254
10255	/*
10256	 * The compressed data must be smaller than the decompressed data.
10257	 *
10258	 * It's of course possible for data to compress to larger or the same
10259	 * size, but the buffered I/O path falls back to no compression for such
10260	 * data, and we don't want to break any assumptions by creating these
10261	 * extents.
10262	 *
10263	 * Note that this is less strict than the current check we have that the
10264	 * compressed data must be at least one sector smaller than the
10265	 * decompressed data. We only want to enforce the weaker requirement
10266	 * from old kernels that it is at least one byte smaller.
10267	 */
10268	if (orig_count >= encoded->unencoded_len)
10269		return -EINVAL;
10270
10271	/* The extent must start on a sector boundary. */
10272	start = iocb->ki_pos;
10273	if (!IS_ALIGNED(start, fs_info->sectorsize))
10274		return -EINVAL;
10275
10276	/*
10277	 * The extent must end on a sector boundary. However, we allow a write
10278	 * which ends at or extends i_size to have an unaligned length; we round
10279	 * up the extent size and set i_size to the unaligned end.
10280	 */
10281	if (start + encoded->len < inode->vfs_inode.i_size &&
10282	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10283		return -EINVAL;
10284
10285	/* Finally, the offset in the unencoded data must be sector-aligned. */
10286	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10287		return -EINVAL;
10288
10289	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10290	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10291	end = start + num_bytes - 1;
10292
10293	/*
10294	 * If the extent cannot be inline, the compressed data on disk must be
10295	 * sector-aligned. For convenience, we extend it with zeroes if it
10296	 * isn't.
10297	 */
10298	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10299	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10300	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10301	if (!pages)
10302		return -ENOMEM;
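	/*
	 * Copy the caller's (compressed) data into the pages, zero filling the
	 * tail of the last partial page so the sector aligned extent is fully
	 * initialized.
	 */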
10303	for (i = 0; i < nr_pages; i++) {
10304		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10305		char *kaddr;
10306
10307		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10308		if (!pages[i]) {
10309			ret = -ENOMEM;
10310			goto out_pages;
10311		}
10312		kaddr = kmap_local_page(pages[i]);
10313		if (copy_from_iter(kaddr, bytes, from) != bytes) {
10314			kunmap_local(kaddr);
10315			ret = -EFAULT;
10316			goto out_pages;
10317		}
10318		if (bytes < PAGE_SIZE)
10319			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10320		kunmap_local(kaddr);
10321	}
10322
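	/*
	 * Before creating our own ordered extent, the target range has to be
	 * free of ordered extents and page cache pages: wait for in-flight
	 * ordered I/O, drop the pages, then take the extent lock. If an
	 * ordered extent or a page shows up again in the meantime (e.g. a
	 * concurrent read faulted pages back in), unlock and retry.
	 */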
10323	for (;;) {
10324		struct btrfs_ordered_extent *ordered;
10325
10326		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10327		if (ret)
10328			goto out_pages;
10329		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10330						    start >> PAGE_SHIFT,
10331						    end >> PAGE_SHIFT);
10332		if (ret)
10333			goto out_pages;
10334		lock_extent(io_tree, start, end, &cached_state);
10335		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10336		if (!ordered &&
10337		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10338			break;
10339		if (ordered)
10340			btrfs_put_ordered_extent(ordered);
10341		unlock_extent(io_tree, start, end, &cached_state);
10342		cond_resched();
10343	}
10344
10345	/*
10346	 * We don't use the higher-level delalloc space functions because our
10347	 * num_bytes and disk_num_bytes are different.
10348	 */
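	/*
	 * Note that data space is reserved for the bytes that actually hit
	 * disk (disk_num_bytes), while the qgroup and metadata reservations
	 * below cover the logical file range (num_bytes).
	 */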
10349	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10350	if (ret)
10351		goto out_unlock;
10352	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10353	if (ret)
10354		goto out_free_data_space;
10355	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10356					      false);
10357	if (ret)
10358		goto out_qgroup_free_data;
10359
10360	/* Try an inline extent first. */
10361	if (start == 0 && encoded->unencoded_len == encoded->len &&
10362	    encoded->unencoded_offset == 0) {
10363		ret = cow_file_range_inline(inode, encoded->len, orig_count,
10364					    compression, pages, true);
10365		if (ret <= 0) {
10366			if (ret == 0)
10367				ret = orig_count;
10368			goto out_delalloc_release;
10369		}
10370	}
10371
10372	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
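	/*
	 * Getting here means the data could not be inlined (or the range did
	 * not qualify for an inline extent), so reserve a real extent and
	 * write it out as a regular compressed extent.
	 */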
10373				   disk_num_bytes, 0, 0, &ins, 1, 1);
10374	if (ret)
10375		goto out_delalloc_release;
10376	extent_reserved = true;
10377
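	/*
	 * Insert an extent map and an ordered extent for the reserved range,
	 * both marked compressed (the ordered extent also as encoded), so the
	 * data submitted below is written out and recorded as a compressed
	 * extent rather than being compressed again.
	 */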
10378	em = create_io_em(inode, start, num_bytes,
10379			  start - encoded->unencoded_offset, ins.objectid,
10380			  ins.offset, ins.offset, ram_bytes, compression,
10381			  BTRFS_ORDERED_COMPRESSED);
10382	if (IS_ERR(em)) {
10383		ret = PTR_ERR(em);
10384		goto out_free_reserved;
10385	}
10386	free_extent_map(em);
10387
10388	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10389				       ins.objectid, ins.offset,
10390				       encoded->unencoded_offset,
10391				       (1 << BTRFS_ORDERED_ENCODED) |
10392				       (1 << BTRFS_ORDERED_COMPRESSED),
10393				       compression);
10394	if (IS_ERR(ordered)) {
10395		btrfs_drop_extent_map_range(inode, start, end, false);
10396		ret = PTR_ERR(ordered);
10397		goto out_free_reserved;
10398	}
10399	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10400
10401	if (start + encoded->len > inode->vfs_inode.i_size)
10402		i_size_write(&inode->vfs_inode, start + encoded->len);
10403
10404	unlock_extent(io_tree, start, end, &cached_state);
10405
10406	btrfs_delalloc_release_extents(inode, num_bytes);
10407
10408	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10409	ret = orig_count;
10410	goto out;
10411
10412out_free_reserved:
10413	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10414	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10415out_delalloc_release:
10416	btrfs_delalloc_release_extents(inode, num_bytes);
10417	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10418out_qgroup_free_data:
10419	if (ret < 0)
10420		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
10421out_free_data_space:
10422	/*
10423	 * If btrfs_reserve_extent() succeeded, then we already decremented
10424	 * bytes_may_use.
10425	 */
10426	if (!extent_reserved)
10427		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10428out_unlock:
10429	unlock_extent(io_tree, start, end, &cached_state);
10430out_pages:
10431	for (i = 0; i < nr_pages; i++) {
10432		if (pages[i])
10433			__free_page(pages[i]);
10434	}
10435	kvfree(pages);
10436out:
10437	if (ret >= 0)
10438		iocb->ki_pos += encoded->len;
10439	return ret;
10440}
10441
10442#ifdef CONFIG_SWAP
10443/*
10444 * Add an entry indicating a block group or device which is pinned by a
10445 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10446 * negative errno on failure.
10447 */
10448static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10449				  bool is_block_group)
10450{
10451	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10452	struct btrfs_swapfile_pin *sp, *entry;
10453	struct rb_node **p;
10454	struct rb_node *parent = NULL;
10455
10456	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10457	if (!sp)
10458		return -ENOMEM;
10459	sp->ptr = ptr;
10460	sp->inode = inode;
10461	sp->is_block_group = is_block_group;
10462	sp->bg_extent_count = 1;
10463
10464	spin_lock(&fs_info->swapfile_pins_lock);
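	/*
	 * Entries are kept in an rbtree ordered by (ptr, inode). If an entry
	 * for this pair already exists, just bump its extent count (for block
	 * groups) instead of inserting a duplicate.
	 */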
10465	p = &fs_info->swapfile_pins.rb_node;
10466	while (*p) {
10467		parent = *p;
10468		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10469		if (sp->ptr < entry->ptr ||
10470		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10471			p = &(*p)->rb_left;
10472		} else if (sp->ptr > entry->ptr ||
10473			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10474			p = &(*p)->rb_right;
10475		} else {
10476			if (is_block_group)
10477				entry->bg_extent_count++;
10478			spin_unlock(&fs_info->swapfile_pins_lock);
10479			kfree(sp);
10480			return 1;
10481		}
10482	}
10483	rb_link_node(&sp->node, parent, p);
10484	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10485	spin_unlock(&fs_info->swapfile_pins_lock);
10486	return 0;
10487}
10488
10489/* Free all of the entries pinned by this swapfile. */
10490static void btrfs_free_swapfile_pins(struct inode *inode)
10491{
10492	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10493	struct btrfs_swapfile_pin *sp;
10494	struct rb_node *node, *next;
10495
10496	spin_lock(&fs_info->swapfile_pins_lock);
10497	node = rb_first(&fs_info->swapfile_pins);
10498	while (node) {
10499		next = rb_next(node);
10500		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10501		if (sp->inode == inode) {
10502			rb_erase(&sp->node, &fs_info->swapfile_pins);
10503			if (sp->is_block_group) {
10504				btrfs_dec_block_group_swap_extents(sp->ptr,
10505							   sp->bg_extent_count);
10506				btrfs_put_block_group(sp->ptr);
10507			}
10508			kfree(sp);
10509		}
10510		node = next;
10511	}
10512	spin_unlock(&fs_info->swapfile_pins_lock);
10513}
10514
10515struct btrfs_swap_info {
10516	u64 start;
10517	u64 block_start;
10518	u64 block_len;
10519	u64 lowest_ppage;
10520	u64 highest_ppage;
10521	unsigned long nr_pages;
10522	int nr_extents;
10523};
10524
10525static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10526				 struct btrfs_swap_info *bsi)
10527{
10528	unsigned long nr_pages;
10529	unsigned long max_pages;
10530	u64 first_ppage, first_ppage_reported, next_ppage;
10531	int ret;
10532
10533	/*
10534	 * Our swapfile may have had its size extended after the swap header was
10535	 * written. In that case activating the swapfile should not go beyond
10536	 * the max size set in the swap header.
10537	 */
10538	if (bsi->nr_pages >= sis->max)
10539		return 0;
10540
10541	max_pages = sis->max - bsi->nr_pages;
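	/*
	 * Only whole physical pages can be used for swap, so round the start
	 * of this extent up and its end down to page boundaries, and cap the
	 * number of pages at what the swap header allows.
	 */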
10542	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10543	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10544
10545	if (first_ppage >= next_ppage)
10546		return 0;
10547	nr_pages = next_ppage - first_ppage;
10548	nr_pages = min(nr_pages, max_pages);
10549
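	/*
	 * The page at file offset 0 holds the swap header rather than usable
	 * swap space, which is presumably why it is excluded from the lowest
	 * reported physical page below.
	 */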
10550	first_ppage_reported = first_ppage;
10551	if (bsi->start == 0)
10552		first_ppage_reported++;
10553	if (bsi->lowest_ppage > first_ppage_reported)
10554		bsi->lowest_ppage = first_ppage_reported;
10555	if (bsi->highest_ppage < (next_ppage - 1))
10556		bsi->highest_ppage = next_ppage - 1;
10557
10558	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10559	if (ret < 0)
10560		return ret;
10561	bsi->nr_extents += ret;
10562	bsi->nr_pages += nr_pages;
10563	return 0;
10564}
10565
10566static void btrfs_swap_deactivate(struct file *file)
10567{
10568	struct inode *inode = file_inode(file);
10569
10570	btrfs_free_swapfile_pins(inode);
10571	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10572}
10573
10574static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10575			       sector_t *span)
10576{
10577	struct inode *inode = file_inode(file);
10578	struct btrfs_root *root = BTRFS_I(inode)->root;
10579	struct btrfs_fs_info *fs_info = root->fs_info;
10580	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10581	struct extent_state *cached_state = NULL;
10582	struct extent_map *em = NULL;
10583	struct btrfs_device *device = NULL;
10584	struct btrfs_swap_info bsi = {
10585		.lowest_ppage = (sector_t)-1ULL,
10586	};
10587	int ret = 0;
10588	u64 isize;
10589	u64 start;
10590
10591	/*
10592	 * If the swap file was just created, make sure delalloc is done. If the
10593	 * file changes again after this, the user is doing something stupid and
10594	 * we don't really care.
10595	 */
10596	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10597	if (ret)
10598		return ret;
10599
10600	/*
10601	 * The inode is locked, so these flags won't change after we check them.
10602	 */
10603	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10604		btrfs_warn(fs_info, "swapfile must not be compressed");
10605		return -EINVAL;
10606	}
10607	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10608		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10609		return -EINVAL;
10610	}
10611	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10612		btrfs_warn(fs_info, "swapfile must not be checksummed");
10613		return -EINVAL;
10614	}
10615
10616	/*
10617	 * Balance or device remove/replace/resize can move stuff around from
10618	 * under us. The exclop protection makes sure they aren't running/won't
10619	 * run concurrently while we are mapping the swap extents, and
10620	 * fs_info->swapfile_pins prevents them from running while the swap
10621	 * file is active and moving the extents. Note that this also prevents
10622	 * a concurrent device add, which isn't actually necessary, but it's not
10623	 * really worth the trouble to allow it.
10624	 */
10625	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10626		btrfs_warn(fs_info,
10627	   "cannot activate swapfile while exclusive operation is running");
10628		return -EBUSY;
10629	}
10630
10631	/*
10632	 * Prevent snapshot creation while we are activating the swap file.
10633	 * We do not want to race with snapshot creation. If snapshot creation
10634	 * already started before we bumped nr_swapfiles from 0 to 1 and
10635	 * completes before the first write into the swap file after it is
10636	 * activated, then that write would fall back to COW.
10637	 */
10638	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10639		btrfs_exclop_finish(fs_info);
10640		btrfs_warn(fs_info,
10641	   "cannot activate swapfile because snapshot creation is in progress");
10642		return -EINVAL;
10643	}
10644	/*
10645	 * Snapshots can create extents which require COW even if NODATACOW is
10646	 * set. We use this counter to prevent snapshots. We must increment it
10647	 * before walking the extents because we don't want a concurrent
10648	 * snapshot to run after we've already checked the extents.
10649	 *
10650	 * It is possible that subvolume is marked for deletion but still not
10651	 * It is possible that the subvolume is marked for deletion but has not
10652	 * been removed yet. To prevent this race, we check the root status before
10653	 */
10654	spin_lock(&root->root_item_lock);
10655	if (btrfs_root_dead(root)) {
10656		spin_unlock(&root->root_item_lock);
10657
10658		btrfs_exclop_finish(fs_info);
10659		btrfs_warn(fs_info,
10660		"cannot activate swapfile because subvolume %llu is being deleted",
10661			root->root_key.objectid);
10662		return -EPERM;
10663	}
10664	atomic_inc(&root->nr_swapfiles);
10665	spin_unlock(&root->root_item_lock);
10666
10667	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10668
10669	lock_extent(io_tree, 0, isize - 1, &cached_state);
10670	start = 0;
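	/*
	 * Walk the file one extent at a time: each extent must be a plain,
	 * uncompressed, NOCOW extent in a single-profile block group on the
	 * same single device. Pin the device and every block group we touch
	 * so relocation can't move the extents, and merge physically
	 * contiguous ranges into swap extents as we go.
	 */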
10671	while (start < isize) {
10672		u64 logical_block_start, physical_block_start;
10673		struct btrfs_block_group *bg;
10674		u64 len = isize - start;
10675
10676		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10677		if (IS_ERR(em)) {
10678			ret = PTR_ERR(em);
10679			goto out;
10680		}
10681
10682		if (em->block_start == EXTENT_MAP_HOLE) {
10683			btrfs_warn(fs_info, "swapfile must not have holes");
10684			ret = -EINVAL;
10685			goto out;
10686		}
10687		if (em->block_start == EXTENT_MAP_INLINE) {
10688			/*
10689			 * It's unlikely we'll ever actually find ourselves
10690			 * here, as a file small enough to fit inline won't be
10691			 * big enough to store more than the swap header, but in
10692			 * case something changes in the future, let's catch it
10693			 * here rather than later.
10694			 */
10695			btrfs_warn(fs_info, "swapfile must not be inline");
10696			ret = -EINVAL;
10697			goto out;
10698		}
10699		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10700			btrfs_warn(fs_info, "swapfile must not be compressed");
10701			ret = -EINVAL;
10702			goto out;
10703		}
10704
10705		logical_block_start = em->block_start + (start - em->start);
10706		len = min(len, em->len - (start - em->start));
10707		free_extent_map(em);
10708		em = NULL;
10709
10710		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10711		if (ret < 0) {
10712			goto out;
10713		} else if (ret) {
10714			ret = 0;
10715		} else {
10716			btrfs_warn(fs_info,
10717				   "swapfile must not be copy-on-write");
10718			ret = -EINVAL;
10719			goto out;
10720		}
10721
10722		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10723		if (IS_ERR(em)) {
10724			ret = PTR_ERR(em);
10725			goto out;
10726		}
10727
10728		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10729			btrfs_warn(fs_info,
10730				   "swapfile must have single data profile");
10731			ret = -EINVAL;
10732			goto out;
10733		}
10734
10735		if (device == NULL) {
10736			device = em->map_lookup->stripes[0].dev;
10737			ret = btrfs_add_swapfile_pin(inode, device, false);
10738			if (ret == 1)
10739				ret = 0;
10740			else if (ret)
10741				goto out;
10742		} else if (device != em->map_lookup->stripes[0].dev) {
10743			btrfs_warn(fs_info, "swapfile must be on one device");
10744			ret = -EINVAL;
10745			goto out;
10746		}
10747
10748		physical_block_start = (em->map_lookup->stripes[0].physical +
10749					(logical_block_start - em->start));
10750		len = min(len, em->len - (logical_block_start - em->start));
10751		free_extent_map(em);
10752		em = NULL;
10753
10754		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10755		if (!bg) {
10756			btrfs_warn(fs_info,
10757			   "could not find block group containing swapfile");
10758			ret = -EINVAL;
10759			goto out;
10760		}
10761
10762		if (!btrfs_inc_block_group_swap_extents(bg)) {
10763			btrfs_warn(fs_info,
10764			   "block group for swapfile at %llu is read-only%s",
10765			   bg->start,
10766			   atomic_read(&fs_info->scrubs_running) ?
10767				       " (scrub running)" : "");
10768			btrfs_put_block_group(bg);
10769			ret = -EINVAL;
10770			goto out;
10771		}
10772
10773		ret = btrfs_add_swapfile_pin(inode, bg, true);
10774		if (ret) {
10775			btrfs_put_block_group(bg);
10776			if (ret == 1)
10777				ret = 0;
10778			else
10779				goto out;
10780		}
10781
10782		if (bsi.block_len &&
10783		    bsi.block_start + bsi.block_len == physical_block_start) {
10784			bsi.block_len += len;
10785		} else {
10786			if (bsi.block_len) {
10787				ret = btrfs_add_swap_extent(sis, &bsi);
10788				if (ret)
10789					goto out;
10790			}
10791			bsi.start = start;
10792			bsi.block_start = physical_block_start;
10793			bsi.block_len = len;
10794		}
10795
10796		start += len;
10797	}
10798
10799	if (bsi.block_len)
10800		ret = btrfs_add_swap_extent(sis, &bsi);
10801
10802out:
10803	if (!IS_ERR_OR_NULL(em))
10804		free_extent_map(em);
10805
10806	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10807
10808	if (ret)
10809		btrfs_swap_deactivate(file);
10810
10811	btrfs_drew_write_unlock(&root->snapshot_lock);
10812
10813	btrfs_exclop_finish(fs_info);
10814
10815	if (ret)
10816		return ret;
10817
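	/*
	 * Report the backing device and the physical page span to the swap
	 * code. nr_pages includes the swap header page, which is not a usable
	 * swap slot, which is likely why the page count and highest bit are
	 * reduced by one.
	 */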
10818	if (device)
10819		sis->bdev = device->bdev;
10820	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10821	sis->max = bsi.nr_pages;
10822	sis->pages = bsi.nr_pages - 1;
10823	sis->highest_bit = bsi.nr_pages - 1;
10824	return bsi.nr_extents;
10825}
10826#else
10827static void btrfs_swap_deactivate(struct file *file)
10828{
10829}
10830
10831static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10832			       sector_t *span)
10833{
10834	return -EOPNOTSUPP;
10835}
10836#endif
10837
10838/*
10839 * Update the number of bytes used in the VFS' inode. When we replace extents in
10840 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10841 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10842 * always get a correct value.
10843 */
10844void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10845			      const u64 add_bytes,
10846			      const u64 del_bytes)
10847{
10848	if (add_bytes == del_bytes)
10849		return;
10850
10851	spin_lock(&inode->lock);
10852	if (del_bytes > 0)
10853		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10854	if (add_bytes > 0)
10855		inode_add_bytes(&inode->vfs_inode, add_bytes);
10856	spin_unlock(&inode->lock);
10857}
10858
10859/*
10860 * Verify that there are no ordered extents for a given file range.
10861 *
10862 * @inode:   The target inode.
10863 * @start:   Start offset of the file range, should be sector size aligned.
10864 * @end:     End offset (inclusive) of the file range, its value +1 should be
10865 *           sector size aligned.
10866 *
10867 * This should typically be used for cases where we have locked the inode's VFS
10868 * lock in exclusive mode, have also locked the inode's i_mmap_lock in exclusive
10869 * mode, have flushed all delalloc in the range, have waited for all ordered
10870 * extents in the range to complete and finally have locked the file range in
10871 * the inode's io_tree.
10872 */
10873void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10874{
10875	struct btrfs_root *root = inode->root;
10876	struct btrfs_ordered_extent *ordered;
10877
10878	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10879		return;
10880
10881	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10882	if (ordered) {
10883		btrfs_err(root->fs_info,
10884"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10885			  start, end, btrfs_ino(inode), root->root_key.objectid,
10886			  ordered->file_offset,
10887			  ordered->file_offset + ordered->num_bytes - 1);
10888		btrfs_put_ordered_extent(ordered);
10889	}
10890
10891	ASSERT(ordered == NULL);
10892}
10893
10894static const struct inode_operations btrfs_dir_inode_operations = {
10895	.getattr	= btrfs_getattr,
10896	.lookup		= btrfs_lookup,
10897	.create		= btrfs_create,
10898	.unlink		= btrfs_unlink,
10899	.link		= btrfs_link,
10900	.mkdir		= btrfs_mkdir,
10901	.rmdir		= btrfs_rmdir,
10902	.rename		= btrfs_rename2,
10903	.symlink	= btrfs_symlink,
10904	.setattr	= btrfs_setattr,
10905	.mknod		= btrfs_mknod,
10906	.listxattr	= btrfs_listxattr,
10907	.permission	= btrfs_permission,
10908	.get_inode_acl	= btrfs_get_acl,
10909	.set_acl	= btrfs_set_acl,
10910	.update_time	= btrfs_update_time,
10911	.tmpfile        = btrfs_tmpfile,
10912	.fileattr_get	= btrfs_fileattr_get,
10913	.fileattr_set	= btrfs_fileattr_set,
10914};
10915
10916static const struct file_operations btrfs_dir_file_operations = {
10917	.llseek		= btrfs_dir_llseek,
10918	.read		= generic_read_dir,
10919	.iterate_shared	= btrfs_real_readdir,
10920	.open		= btrfs_opendir,
10921	.unlocked_ioctl	= btrfs_ioctl,
10922#ifdef CONFIG_COMPAT
10923	.compat_ioctl	= btrfs_compat_ioctl,
10924#endif
10925	.release        = btrfs_release_file,
10926	.fsync		= btrfs_sync_file,
10927};
10928
10929/*
10930 * btrfs doesn't support the bmap operation because swapfiles
10931 * use bmap to make a mapping of extents in the file.  They assume
10932 * these extents won't change over the life of the file and they
10933 * use the bmap result to do IO directly to the drive.
10934 *
10935 * A btrfs bmap call would return logical addresses that aren't
10936 * suitable for IO and that would also change frequently as COW
10937 * operations happen.  So, swapfile + btrfs == corruption.
10938 *
10939 * For now we're avoiding this by dropping bmap.
10940 */
10941static const struct address_space_operations btrfs_aops = {
10942	.read_folio	= btrfs_read_folio,
10943	.writepages	= btrfs_writepages,
10944	.readahead	= btrfs_readahead,
10945	.invalidate_folio = btrfs_invalidate_folio,
10946	.release_folio	= btrfs_release_folio,
10947	.migrate_folio	= btrfs_migrate_folio,
10948	.dirty_folio	= filemap_dirty_folio,
10949	.error_remove_page = generic_error_remove_page,
10950	.swap_activate	= btrfs_swap_activate,
10951	.swap_deactivate = btrfs_swap_deactivate,
10952};
10953
10954static const struct inode_operations btrfs_file_inode_operations = {
10955	.getattr	= btrfs_getattr,
10956	.setattr	= btrfs_setattr,
10957	.listxattr      = btrfs_listxattr,
10958	.permission	= btrfs_permission,
10959	.fiemap		= btrfs_fiemap,
10960	.get_inode_acl	= btrfs_get_acl,
10961	.set_acl	= btrfs_set_acl,
10962	.update_time	= btrfs_update_time,
10963	.fileattr_get	= btrfs_fileattr_get,
10964	.fileattr_set	= btrfs_fileattr_set,
10965};
10966static const struct inode_operations btrfs_special_inode_operations = {
10967	.getattr	= btrfs_getattr,
10968	.setattr	= btrfs_setattr,
10969	.permission	= btrfs_permission,
10970	.listxattr	= btrfs_listxattr,
10971	.get_inode_acl	= btrfs_get_acl,
10972	.set_acl	= btrfs_set_acl,
10973	.update_time	= btrfs_update_time,
10974};
10975static const struct inode_operations btrfs_symlink_inode_operations = {
10976	.get_link	= page_get_link,
10977	.getattr	= btrfs_getattr,
10978	.setattr	= btrfs_setattr,
10979	.permission	= btrfs_permission,
10980	.listxattr	= btrfs_listxattr,
10981	.update_time	= btrfs_update_time,
10982};
10983
10984const struct dentry_operations btrfs_dentry_operations = {
10985	.d_delete	= btrfs_dentry_delete,
10986};
10987