xref: /kernel/linux/linux-6.6/fs/btrfs/file.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include <linux/fsverity.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "btrfs_inode.h"
24#include "print-tree.h"
25#include "tree-log.h"
26#include "locking.h"
27#include "volumes.h"
28#include "qgroup.h"
29#include "compression.h"
30#include "delalloc-space.h"
31#include "reflink.h"
32#include "subpage.h"
33#include "fs.h"
34#include "accessors.h"
35#include "extent-tree.h"
36#include "file-item.h"
37#include "ioctl.h"
38#include "file.h"
39#include "super.h"
40
41/* simple helper to fault in pages and copy.  This should go away
42 * and be replaced with calls into generic code.
43 */
44static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
45					 struct page **prepared_pages,
46					 struct iov_iter *i)
47{
48	size_t copied = 0;
49	size_t total_copied = 0;
50	int pg = 0;
51	int offset = offset_in_page(pos);
52
53	while (write_bytes > 0) {
54		size_t count = min_t(size_t,
55				     PAGE_SIZE - offset, write_bytes);
56		struct page *page = prepared_pages[pg];
57		/*
58		 * Copy data from userspace to the current page
59		 */
60		copied = copy_page_from_iter_atomic(page, offset, count, i);
61
62		/* Flush processor's dcache for this page */
63		flush_dcache_page(page);
64
65		/*
66		 * if we get a partial write, we can end up with
67		 * partially up to date pages.  These add
68		 * a lot of complexity, so make sure they don't
69		 * happen by forcing this copy to be retried.
70		 *
71		 * The rest of the btrfs_file_write code will fall
72		 * back to page at a time copies after we return 0.
73		 */
74		if (unlikely(copied < count)) {
75			if (!PageUptodate(page)) {
76				iov_iter_revert(i, copied);
77				copied = 0;
78			}
79			if (!copied)
80				break;
81		}
82
83		write_bytes -= copied;
84		total_copied += copied;
85		offset += copied;
86		if (offset == PAGE_SIZE) {
87			pg++;
88			offset = 0;
89		}
90	}
91	return total_copied;
92}
93
94/*
95 * unlocks pages after btrfs_file_write is done with them
96 */
97static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
98			     struct page **pages, size_t num_pages,
99			     u64 pos, u64 copied)
100{
101	size_t i;
102	u64 block_start = round_down(pos, fs_info->sectorsize);
103	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
104
105	ASSERT(block_len <= U32_MAX);
106	for (i = 0; i < num_pages; i++) {
107		/* page checked is some magic around finding pages that
108		 * have been modified without going through btrfs_set_page_dirty
109		 * clear it here. There should be no need to mark the pages
110		 * accessed as prepare_pages should have marked them accessed
111		 * in prepare_pages via find_or_create_page()
112		 */
113		btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
114					       block_len);
115		unlock_page(pages[i]);
116		put_page(pages[i]);
117	}
118}
119
120/*
121 * After btrfs_copy_from_user(), update the following things for delalloc:
122 * - Mark newly dirtied pages as DELALLOC in the io tree.
123 *   Used to advise which range is to be written back.
124 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
125 * - Update inode size for past EOF write
126 */
127int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
128		      size_t num_pages, loff_t pos, size_t write_bytes,
129		      struct extent_state **cached, bool noreserve)
130{
131	struct btrfs_fs_info *fs_info = inode->root->fs_info;
132	int err = 0;
133	int i;
134	u64 num_bytes;
135	u64 start_pos;
136	u64 end_of_last_block;
137	u64 end_pos = pos + write_bytes;
138	loff_t isize = i_size_read(&inode->vfs_inode);
139	unsigned int extra_bits = 0;
140
141	if (write_bytes == 0)
142		return 0;
143
144	if (noreserve)
145		extra_bits |= EXTENT_NORESERVE;
146
147	start_pos = round_down(pos, fs_info->sectorsize);
148	num_bytes = round_up(write_bytes + pos - start_pos,
149			     fs_info->sectorsize);
150	ASSERT(num_bytes <= U32_MAX);
151
152	end_of_last_block = start_pos + num_bytes - 1;
153
154	/*
155	 * The pages may have already been dirty, clear out old accounting so
156	 * we can set things up properly
157	 */
158	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
159			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
160			 cached);
161
162	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
163					extra_bits, cached);
164	if (err)
165		return err;
166
167	for (i = 0; i < num_pages; i++) {
168		struct page *p = pages[i];
169
170		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
171		btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
172		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
173	}
174
175	/*
176	 * we've only changed i_size in ram, and we haven't updated
177	 * the disk i_size.  There is no need to log the inode
178	 * at this time.
179	 */
180	if (end_pos > isize)
181		i_size_write(&inode->vfs_inode, end_pos);
182	return 0;
183}
184
185/*
186 * this is very complex, but the basic idea is to drop all extents
187 * in the range start - end.  hint_block is filled in with a block number
188 * that would be a good hint to the block allocator for this file.
189 *
190 * If an extent intersects the range but is not entirely inside the range
191 * it is either truncated or split.  Anything entirely inside the range
192 * is deleted from the tree.
193 *
194 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
195 * to deal with that. We set the field 'bytes_found' of the arguments structure
196 * with the number of allocated bytes found in the target range, so that the
197 * caller can update the inode's number of bytes in an atomic way when
198 * replacing extents in a range to avoid races with stat(2).
199 */
200int btrfs_drop_extents(struct btrfs_trans_handle *trans,
201		       struct btrfs_root *root, struct btrfs_inode *inode,
202		       struct btrfs_drop_extents_args *args)
203{
204	struct btrfs_fs_info *fs_info = root->fs_info;
205	struct extent_buffer *leaf;
206	struct btrfs_file_extent_item *fi;
207	struct btrfs_ref ref = { 0 };
208	struct btrfs_key key;
209	struct btrfs_key new_key;
210	u64 ino = btrfs_ino(inode);
211	u64 search_start = args->start;
212	u64 disk_bytenr = 0;
213	u64 num_bytes = 0;
214	u64 extent_offset = 0;
215	u64 extent_end = 0;
216	u64 last_end = args->start;
217	int del_nr = 0;
218	int del_slot = 0;
219	int extent_type;
220	int recow;
221	int ret;
222	int modify_tree = -1;
223	int update_refs;
224	int found = 0;
225	struct btrfs_path *path = args->path;
226
227	args->bytes_found = 0;
228	args->extent_inserted = false;
229
230	/* Must always have a path if ->replace_extent is true */
231	ASSERT(!(args->replace_extent && !args->path));
232
233	if (!path) {
234		path = btrfs_alloc_path();
235		if (!path) {
236			ret = -ENOMEM;
237			goto out;
238		}
239	}
240
241	if (args->drop_cache)
242		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
243
244	if (args->start >= inode->disk_i_size && !args->replace_extent)
245		modify_tree = 0;
246
247	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
248	while (1) {
249		recow = 0;
250		ret = btrfs_lookup_file_extent(trans, root, path, ino,
251					       search_start, modify_tree);
252		if (ret < 0)
253			break;
254		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
255			leaf = path->nodes[0];
256			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
257			if (key.objectid == ino &&
258			    key.type == BTRFS_EXTENT_DATA_KEY)
259				path->slots[0]--;
260		}
261		ret = 0;
262next_slot:
263		leaf = path->nodes[0];
264		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
265			BUG_ON(del_nr > 0);
266			ret = btrfs_next_leaf(root, path);
267			if (ret < 0)
268				break;
269			if (ret > 0) {
270				ret = 0;
271				break;
272			}
273			leaf = path->nodes[0];
274			recow = 1;
275		}
276
277		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
278
279		if (key.objectid > ino)
280			break;
281		if (WARN_ON_ONCE(key.objectid < ino) ||
282		    key.type < BTRFS_EXTENT_DATA_KEY) {
283			ASSERT(del_nr == 0);
284			path->slots[0]++;
285			goto next_slot;
286		}
287		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
288			break;
289
290		fi = btrfs_item_ptr(leaf, path->slots[0],
291				    struct btrfs_file_extent_item);
292		extent_type = btrfs_file_extent_type(leaf, fi);
293
294		if (extent_type == BTRFS_FILE_EXTENT_REG ||
295		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
296			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
297			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
298			extent_offset = btrfs_file_extent_offset(leaf, fi);
299			extent_end = key.offset +
300				btrfs_file_extent_num_bytes(leaf, fi);
301		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
302			extent_end = key.offset +
303				btrfs_file_extent_ram_bytes(leaf, fi);
304		} else {
305			/* can't happen */
306			BUG();
307		}
308
309		/*
310		 * Don't skip extent items representing 0 byte lengths. They
311		 * used to be created (bug) if while punching holes we hit
312		 * -ENOSPC condition. So if we find one here, just ensure we
313		 * delete it, otherwise we would insert a new file extent item
314		 * with the same key (offset) as that 0 bytes length file
315		 * extent item in the call to setup_items_for_insert() later
316		 * in this function.
317		 */
318		if (extent_end == key.offset && extent_end >= search_start) {
319			last_end = extent_end;
320			goto delete_extent_item;
321		}
322
323		if (extent_end <= search_start) {
324			path->slots[0]++;
325			goto next_slot;
326		}
327
328		found = 1;
329		search_start = max(key.offset, args->start);
330		if (recow || !modify_tree) {
331			modify_tree = -1;
332			btrfs_release_path(path);
333			continue;
334		}
335
336		/*
337		 *     | - range to drop - |
338		 *  | -------- extent -------- |
339		 */
340		if (args->start > key.offset && args->end < extent_end) {
341			BUG_ON(del_nr > 0);
342			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
343				ret = -EOPNOTSUPP;
344				break;
345			}
346
347			memcpy(&new_key, &key, sizeof(new_key));
348			new_key.offset = args->start;
349			ret = btrfs_duplicate_item(trans, root, path,
350						   &new_key);
351			if (ret == -EAGAIN) {
352				btrfs_release_path(path);
353				continue;
354			}
355			if (ret < 0)
356				break;
357
358			leaf = path->nodes[0];
359			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
360					    struct btrfs_file_extent_item);
361			btrfs_set_file_extent_num_bytes(leaf, fi,
362							args->start - key.offset);
363
364			fi = btrfs_item_ptr(leaf, path->slots[0],
365					    struct btrfs_file_extent_item);
366
367			extent_offset += args->start - key.offset;
368			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
369			btrfs_set_file_extent_num_bytes(leaf, fi,
370							extent_end - args->start);
371			btrfs_mark_buffer_dirty(trans, leaf);
372
373			if (update_refs && disk_bytenr > 0) {
374				btrfs_init_generic_ref(&ref,
375						BTRFS_ADD_DELAYED_REF,
376						disk_bytenr, num_bytes, 0);
377				btrfs_init_data_ref(&ref,
378						root->root_key.objectid,
379						new_key.objectid,
380						args->start - extent_offset,
381						0, false);
382				ret = btrfs_inc_extent_ref(trans, &ref);
383				if (ret) {
384					btrfs_abort_transaction(trans, ret);
385					break;
386				}
387			}
388			key.offset = args->start;
389		}
390		/*
391		 * From here on out we will have actually dropped something, so
392		 * last_end can be updated.
393		 */
394		last_end = extent_end;
395
396		/*
397		 *  | ---- range to drop ----- |
398		 *      | -------- extent -------- |
399		 */
400		if (args->start <= key.offset && args->end < extent_end) {
401			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
402				ret = -EOPNOTSUPP;
403				break;
404			}
405
406			memcpy(&new_key, &key, sizeof(new_key));
407			new_key.offset = args->end;
408			btrfs_set_item_key_safe(trans, path, &new_key);
409
410			extent_offset += args->end - key.offset;
411			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
412			btrfs_set_file_extent_num_bytes(leaf, fi,
413							extent_end - args->end);
414			btrfs_mark_buffer_dirty(trans, leaf);
415			if (update_refs && disk_bytenr > 0)
416				args->bytes_found += args->end - key.offset;
417			break;
418		}
419
420		search_start = extent_end;
421		/*
422		 *       | ---- range to drop ----- |
423		 *  | -------- extent -------- |
424		 */
425		if (args->start > key.offset && args->end >= extent_end) {
426			BUG_ON(del_nr > 0);
427			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
428				ret = -EOPNOTSUPP;
429				break;
430			}
431
432			btrfs_set_file_extent_num_bytes(leaf, fi,
433							args->start - key.offset);
434			btrfs_mark_buffer_dirty(trans, leaf);
435			if (update_refs && disk_bytenr > 0)
436				args->bytes_found += extent_end - args->start;
437			if (args->end == extent_end)
438				break;
439
440			path->slots[0]++;
441			goto next_slot;
442		}
443
444		/*
445		 *  | ---- range to drop ----- |
446		 *    | ------ extent ------ |
447		 */
448		if (args->start <= key.offset && args->end >= extent_end) {
449delete_extent_item:
450			if (del_nr == 0) {
451				del_slot = path->slots[0];
452				del_nr = 1;
453			} else {
454				BUG_ON(del_slot + del_nr != path->slots[0]);
455				del_nr++;
456			}
457
458			if (update_refs &&
459			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
460				args->bytes_found += extent_end - key.offset;
461				extent_end = ALIGN(extent_end,
462						   fs_info->sectorsize);
463			} else if (update_refs && disk_bytenr > 0) {
464				btrfs_init_generic_ref(&ref,
465						BTRFS_DROP_DELAYED_REF,
466						disk_bytenr, num_bytes, 0);
467				btrfs_init_data_ref(&ref,
468						root->root_key.objectid,
469						key.objectid,
470						key.offset - extent_offset, 0,
471						false);
472				ret = btrfs_free_extent(trans, &ref);
473				if (ret) {
474					btrfs_abort_transaction(trans, ret);
475					break;
476				}
477				args->bytes_found += extent_end - key.offset;
478			}
479
480			if (args->end == extent_end)
481				break;
482
483			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
484				path->slots[0]++;
485				goto next_slot;
486			}
487
488			ret = btrfs_del_items(trans, root, path, del_slot,
489					      del_nr);
490			if (ret) {
491				btrfs_abort_transaction(trans, ret);
492				break;
493			}
494
495			del_nr = 0;
496			del_slot = 0;
497
498			btrfs_release_path(path);
499			continue;
500		}
501
502		BUG();
503	}
504
505	if (!ret && del_nr > 0) {
506		/*
507		 * Set path->slots[0] to first slot, so that after the delete
508		 * if items are move off from our leaf to its immediate left or
509		 * right neighbor leafs, we end up with a correct and adjusted
510		 * path->slots[0] for our insertion (if args->replace_extent).
511		 */
512		path->slots[0] = del_slot;
513		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
514		if (ret)
515			btrfs_abort_transaction(trans, ret);
516	}
517
518	leaf = path->nodes[0];
519	/*
520	 * If btrfs_del_items() was called, it might have deleted a leaf, in
521	 * which case it unlocked our path, so check path->locks[0] matches a
522	 * write lock.
523	 */
524	if (!ret && args->replace_extent &&
525	    path->locks[0] == BTRFS_WRITE_LOCK &&
526	    btrfs_leaf_free_space(leaf) >=
527	    sizeof(struct btrfs_item) + args->extent_item_size) {
528
529		key.objectid = ino;
530		key.type = BTRFS_EXTENT_DATA_KEY;
531		key.offset = args->start;
532		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
533			struct btrfs_key slot_key;
534
535			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
536			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
537				path->slots[0]++;
538		}
539		btrfs_setup_item_for_insert(trans, root, path, &key,
540					    args->extent_item_size);
541		args->extent_inserted = true;
542	}
543
544	if (!args->path)
545		btrfs_free_path(path);
546	else if (!args->extent_inserted)
547		btrfs_release_path(path);
548out:
549	args->drop_end = found ? min(args->end, last_end) : args->end;
550
551	return ret;
552}
553
554static int extent_mergeable(struct extent_buffer *leaf, int slot,
555			    u64 objectid, u64 bytenr, u64 orig_offset,
556			    u64 *start, u64 *end)
557{
558	struct btrfs_file_extent_item *fi;
559	struct btrfs_key key;
560	u64 extent_end;
561
562	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
563		return 0;
564
565	btrfs_item_key_to_cpu(leaf, &key, slot);
566	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
567		return 0;
568
569	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
570	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
571	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
572	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
573	    btrfs_file_extent_compression(leaf, fi) ||
574	    btrfs_file_extent_encryption(leaf, fi) ||
575	    btrfs_file_extent_other_encoding(leaf, fi))
576		return 0;
577
578	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
579	if ((*start && *start != key.offset) || (*end && *end != extent_end))
580		return 0;
581
582	*start = key.offset;
583	*end = extent_end;
584	return 1;
585}
586
587/*
588 * Mark extent in the range start - end as written.
589 *
590 * This changes extent type from 'pre-allocated' to 'regular'. If only
591 * part of extent is marked as written, the extent will be split into
592 * two or three.
593 */
594int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
595			      struct btrfs_inode *inode, u64 start, u64 end)
596{
597	struct btrfs_root *root = inode->root;
598	struct extent_buffer *leaf;
599	struct btrfs_path *path;
600	struct btrfs_file_extent_item *fi;
601	struct btrfs_ref ref = { 0 };
602	struct btrfs_key key;
603	struct btrfs_key new_key;
604	u64 bytenr;
605	u64 num_bytes;
606	u64 extent_end;
607	u64 orig_offset;
608	u64 other_start;
609	u64 other_end;
610	u64 split;
611	int del_nr = 0;
612	int del_slot = 0;
613	int recow;
614	int ret = 0;
615	u64 ino = btrfs_ino(inode);
616
617	path = btrfs_alloc_path();
618	if (!path)
619		return -ENOMEM;
620again:
621	recow = 0;
622	split = start;
623	key.objectid = ino;
624	key.type = BTRFS_EXTENT_DATA_KEY;
625	key.offset = split;
626
627	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
628	if (ret < 0)
629		goto out;
630	if (ret > 0 && path->slots[0] > 0)
631		path->slots[0]--;
632
633	leaf = path->nodes[0];
634	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
635	if (key.objectid != ino ||
636	    key.type != BTRFS_EXTENT_DATA_KEY) {
637		ret = -EINVAL;
638		btrfs_abort_transaction(trans, ret);
639		goto out;
640	}
641	fi = btrfs_item_ptr(leaf, path->slots[0],
642			    struct btrfs_file_extent_item);
643	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
644		ret = -EINVAL;
645		btrfs_abort_transaction(trans, ret);
646		goto out;
647	}
648	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
649	if (key.offset > start || extent_end < end) {
650		ret = -EINVAL;
651		btrfs_abort_transaction(trans, ret);
652		goto out;
653	}
654
655	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
656	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
657	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
658	memcpy(&new_key, &key, sizeof(new_key));
659
660	if (start == key.offset && end < extent_end) {
661		other_start = 0;
662		other_end = start;
663		if (extent_mergeable(leaf, path->slots[0] - 1,
664				     ino, bytenr, orig_offset,
665				     &other_start, &other_end)) {
666			new_key.offset = end;
667			btrfs_set_item_key_safe(trans, path, &new_key);
668			fi = btrfs_item_ptr(leaf, path->slots[0],
669					    struct btrfs_file_extent_item);
670			btrfs_set_file_extent_generation(leaf, fi,
671							 trans->transid);
672			btrfs_set_file_extent_num_bytes(leaf, fi,
673							extent_end - end);
674			btrfs_set_file_extent_offset(leaf, fi,
675						     end - orig_offset);
676			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
677					    struct btrfs_file_extent_item);
678			btrfs_set_file_extent_generation(leaf, fi,
679							 trans->transid);
680			btrfs_set_file_extent_num_bytes(leaf, fi,
681							end - other_start);
682			btrfs_mark_buffer_dirty(trans, leaf);
683			goto out;
684		}
685	}
686
687	if (start > key.offset && end == extent_end) {
688		other_start = end;
689		other_end = 0;
690		if (extent_mergeable(leaf, path->slots[0] + 1,
691				     ino, bytenr, orig_offset,
692				     &other_start, &other_end)) {
693			fi = btrfs_item_ptr(leaf, path->slots[0],
694					    struct btrfs_file_extent_item);
695			btrfs_set_file_extent_num_bytes(leaf, fi,
696							start - key.offset);
697			btrfs_set_file_extent_generation(leaf, fi,
698							 trans->transid);
699			path->slots[0]++;
700			new_key.offset = start;
701			btrfs_set_item_key_safe(trans, path, &new_key);
702
703			fi = btrfs_item_ptr(leaf, path->slots[0],
704					    struct btrfs_file_extent_item);
705			btrfs_set_file_extent_generation(leaf, fi,
706							 trans->transid);
707			btrfs_set_file_extent_num_bytes(leaf, fi,
708							other_end - start);
709			btrfs_set_file_extent_offset(leaf, fi,
710						     start - orig_offset);
711			btrfs_mark_buffer_dirty(trans, leaf);
712			goto out;
713		}
714	}
715
716	while (start > key.offset || end < extent_end) {
717		if (key.offset == start)
718			split = end;
719
720		new_key.offset = split;
721		ret = btrfs_duplicate_item(trans, root, path, &new_key);
722		if (ret == -EAGAIN) {
723			btrfs_release_path(path);
724			goto again;
725		}
726		if (ret < 0) {
727			btrfs_abort_transaction(trans, ret);
728			goto out;
729		}
730
731		leaf = path->nodes[0];
732		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
733				    struct btrfs_file_extent_item);
734		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
735		btrfs_set_file_extent_num_bytes(leaf, fi,
736						split - key.offset);
737
738		fi = btrfs_item_ptr(leaf, path->slots[0],
739				    struct btrfs_file_extent_item);
740
741		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
742		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
743		btrfs_set_file_extent_num_bytes(leaf, fi,
744						extent_end - split);
745		btrfs_mark_buffer_dirty(trans, leaf);
746
747		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
748				       num_bytes, 0);
749		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
750				    orig_offset, 0, false);
751		ret = btrfs_inc_extent_ref(trans, &ref);
752		if (ret) {
753			btrfs_abort_transaction(trans, ret);
754			goto out;
755		}
756
757		if (split == start) {
758			key.offset = start;
759		} else {
760			if (start != key.offset) {
761				ret = -EINVAL;
762				btrfs_abort_transaction(trans, ret);
763				goto out;
764			}
765			path->slots[0]--;
766			extent_end = end;
767		}
768		recow = 1;
769	}
770
771	other_start = end;
772	other_end = 0;
773	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
774			       num_bytes, 0);
775	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
776			    0, false);
777	if (extent_mergeable(leaf, path->slots[0] + 1,
778			     ino, bytenr, orig_offset,
779			     &other_start, &other_end)) {
780		if (recow) {
781			btrfs_release_path(path);
782			goto again;
783		}
784		extent_end = other_end;
785		del_slot = path->slots[0] + 1;
786		del_nr++;
787		ret = btrfs_free_extent(trans, &ref);
788		if (ret) {
789			btrfs_abort_transaction(trans, ret);
790			goto out;
791		}
792	}
793	other_start = 0;
794	other_end = start;
795	if (extent_mergeable(leaf, path->slots[0] - 1,
796			     ino, bytenr, orig_offset,
797			     &other_start, &other_end)) {
798		if (recow) {
799			btrfs_release_path(path);
800			goto again;
801		}
802		key.offset = other_start;
803		del_slot = path->slots[0];
804		del_nr++;
805		ret = btrfs_free_extent(trans, &ref);
806		if (ret) {
807			btrfs_abort_transaction(trans, ret);
808			goto out;
809		}
810	}
811	if (del_nr == 0) {
812		fi = btrfs_item_ptr(leaf, path->slots[0],
813			   struct btrfs_file_extent_item);
814		btrfs_set_file_extent_type(leaf, fi,
815					   BTRFS_FILE_EXTENT_REG);
816		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
817		btrfs_mark_buffer_dirty(trans, leaf);
818	} else {
819		fi = btrfs_item_ptr(leaf, del_slot - 1,
820			   struct btrfs_file_extent_item);
821		btrfs_set_file_extent_type(leaf, fi,
822					   BTRFS_FILE_EXTENT_REG);
823		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
824		btrfs_set_file_extent_num_bytes(leaf, fi,
825						extent_end - key.offset);
826		btrfs_mark_buffer_dirty(trans, leaf);
827
828		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
829		if (ret < 0) {
830			btrfs_abort_transaction(trans, ret);
831			goto out;
832		}
833	}
834out:
835	btrfs_free_path(path);
836	return ret;
837}
838
839/*
840 * on error we return an unlocked page and the error value
841 * on success we return a locked page and 0
842 */
843static int prepare_uptodate_page(struct inode *inode,
844				 struct page *page, u64 pos,
845				 bool force_uptodate)
846{
847	struct folio *folio = page_folio(page);
848	int ret = 0;
849
850	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
851	    !PageUptodate(page)) {
852		ret = btrfs_read_folio(NULL, folio);
853		if (ret)
854			return ret;
855		lock_page(page);
856		if (!PageUptodate(page)) {
857			unlock_page(page);
858			return -EIO;
859		}
860
861		/*
862		 * Since btrfs_read_folio() will unlock the folio before it
863		 * returns, there is a window where btrfs_release_folio() can be
864		 * called to release the page.  Here we check both inode
865		 * mapping and PagePrivate() to make sure the page was not
866		 * released.
867		 *
868		 * The private flag check is essential for subpage as we need
869		 * to store extra bitmap using page->private.
870		 */
871		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
872			unlock_page(page);
873			return -EAGAIN;
874		}
875	}
876	return 0;
877}
878
879static fgf_t get_prepare_fgp_flags(bool nowait)
880{
881	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
882
883	if (nowait)
884		fgp_flags |= FGP_NOWAIT;
885
886	return fgp_flags;
887}
888
889static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
890{
891	gfp_t gfp;
892
893	gfp = btrfs_alloc_write_mask(inode->i_mapping);
894	if (nowait) {
895		gfp &= ~__GFP_DIRECT_RECLAIM;
896		gfp |= GFP_NOWAIT;
897	}
898
899	return gfp;
900}
901
902/*
903 * this just gets pages into the page cache and locks them down.
904 */
905static noinline int prepare_pages(struct inode *inode, struct page **pages,
906				  size_t num_pages, loff_t pos,
907				  size_t write_bytes, bool force_uptodate,
908				  bool nowait)
909{
910	int i;
911	unsigned long index = pos >> PAGE_SHIFT;
912	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
913	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
914	int err = 0;
915	int faili;
916
917	for (i = 0; i < num_pages; i++) {
918again:
919		pages[i] = pagecache_get_page(inode->i_mapping, index + i,
920					      fgp_flags, mask | __GFP_WRITE);
921		if (!pages[i]) {
922			faili = i - 1;
923			if (nowait)
924				err = -EAGAIN;
925			else
926				err = -ENOMEM;
927			goto fail;
928		}
929
930		err = set_page_extent_mapped(pages[i]);
931		if (err < 0) {
932			faili = i;
933			goto fail;
934		}
935
936		if (i == 0)
937			err = prepare_uptodate_page(inode, pages[i], pos,
938						    force_uptodate);
939		if (!err && i == num_pages - 1)
940			err = prepare_uptodate_page(inode, pages[i],
941						    pos + write_bytes, false);
942		if (err) {
943			put_page(pages[i]);
944			if (!nowait && err == -EAGAIN) {
945				err = 0;
946				goto again;
947			}
948			faili = i - 1;
949			goto fail;
950		}
951		wait_on_page_writeback(pages[i]);
952	}
953
954	return 0;
955fail:
956	while (faili >= 0) {
957		unlock_page(pages[faili]);
958		put_page(pages[faili]);
959		faili--;
960	}
961	return err;
962
963}
964
965/*
966 * This function locks the extent and properly waits for data=ordered extents
967 * to finish before allowing the pages to be modified if need.
968 *
969 * The return value:
970 * 1 - the extent is locked
971 * 0 - the extent is not locked, and everything is OK
972 * -EAGAIN - need re-prepare the pages
973 * the other < 0 number - Something wrong happens
974 */
975static noinline int
976lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
977				size_t num_pages, loff_t pos,
978				size_t write_bytes,
979				u64 *lockstart, u64 *lockend, bool nowait,
980				struct extent_state **cached_state)
981{
982	struct btrfs_fs_info *fs_info = inode->root->fs_info;
983	u64 start_pos;
984	u64 last_pos;
985	int i;
986	int ret = 0;
987
988	start_pos = round_down(pos, fs_info->sectorsize);
989	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
990
991	if (start_pos < inode->vfs_inode.i_size) {
992		struct btrfs_ordered_extent *ordered;
993
994		if (nowait) {
995			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
996					     cached_state)) {
997				for (i = 0; i < num_pages; i++) {
998					unlock_page(pages[i]);
999					put_page(pages[i]);
1000					pages[i] = NULL;
1001				}
1002
1003				return -EAGAIN;
1004			}
1005		} else {
1006			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
1007		}
1008
1009		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1010						     last_pos - start_pos + 1);
1011		if (ordered &&
1012		    ordered->file_offset + ordered->num_bytes > start_pos &&
1013		    ordered->file_offset <= last_pos) {
1014			unlock_extent(&inode->io_tree, start_pos, last_pos,
1015				      cached_state);
1016			for (i = 0; i < num_pages; i++) {
1017				unlock_page(pages[i]);
1018				put_page(pages[i]);
1019			}
1020			btrfs_start_ordered_extent(ordered);
1021			btrfs_put_ordered_extent(ordered);
1022			return -EAGAIN;
1023		}
1024		if (ordered)
1025			btrfs_put_ordered_extent(ordered);
1026
1027		*lockstart = start_pos;
1028		*lockend = last_pos;
1029		ret = 1;
1030	}
1031
1032	/*
1033	 * We should be called after prepare_pages() which should have locked
1034	 * all pages in the range.
1035	 */
1036	for (i = 0; i < num_pages; i++)
1037		WARN_ON(!PageLocked(pages[i]));
1038
1039	return ret;
1040}
1041
1042/*
1043 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1044 *
1045 * @pos:         File offset.
1046 * @write_bytes: The length to write, will be updated to the nocow writeable
1047 *               range.
1048 *
1049 * This function will flush ordered extents in the range to ensure proper
1050 * nocow checks.
1051 *
1052 * Return:
1053 * > 0          If we can nocow, and updates @write_bytes.
1054 *  0           If we can't do a nocow write.
1055 * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
1056 *              root is in progress.
1057 * < 0          If an error happened.
1058 *
1059 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1060 */
1061int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1062			   size_t *write_bytes, bool nowait)
1063{
1064	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1065	struct btrfs_root *root = inode->root;
1066	struct extent_state *cached_state = NULL;
1067	u64 lockstart, lockend;
1068	u64 num_bytes;
1069	int ret;
1070
1071	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1072		return 0;
1073
1074	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1075		return -EAGAIN;
1076
1077	lockstart = round_down(pos, fs_info->sectorsize);
1078	lockend = round_up(pos + *write_bytes,
1079			   fs_info->sectorsize) - 1;
1080	num_bytes = lockend - lockstart + 1;
1081
1082	if (nowait) {
1083		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1084						  &cached_state)) {
1085			btrfs_drew_write_unlock(&root->snapshot_lock);
1086			return -EAGAIN;
1087		}
1088	} else {
1089		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1090						   &cached_state);
1091	}
1092	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1093			NULL, NULL, NULL, nowait, false);
1094	if (ret <= 0)
1095		btrfs_drew_write_unlock(&root->snapshot_lock);
1096	else
1097		*write_bytes = min_t(size_t, *write_bytes ,
1098				     num_bytes - pos + lockstart);
1099	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1100
1101	return ret;
1102}
1103
1104void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1105{
1106	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1107}
1108
1109static void update_time_for_write(struct inode *inode)
1110{
1111	struct timespec64 now, ctime;
1112
1113	if (IS_NOCMTIME(inode))
1114		return;
1115
1116	now = current_time(inode);
1117	if (!timespec64_equal(&inode->i_mtime, &now))
1118		inode->i_mtime = now;
1119
1120	ctime = inode_get_ctime(inode);
1121	if (!timespec64_equal(&ctime, &now))
1122		inode_set_ctime_to_ts(inode, now);
1123
1124	if (IS_I_VERSION(inode))
1125		inode_inc_iversion(inode);
1126}
1127
1128static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1129			     size_t count)
1130{
1131	struct file *file = iocb->ki_filp;
1132	struct inode *inode = file_inode(file);
1133	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1134	loff_t pos = iocb->ki_pos;
1135	int ret;
1136	loff_t oldsize;
1137	loff_t start_pos;
1138
1139	/*
1140	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1141	 * prealloc flags, as without those flags we always have to COW. We will
1142	 * later check if we can really COW into the target range (using
1143	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1144	 */
1145	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1146	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1147		return -EAGAIN;
1148
1149	ret = file_remove_privs(file);
1150	if (ret)
1151		return ret;
1152
1153	/*
1154	 * We reserve space for updating the inode when we reserve space for the
1155	 * extent we are going to write, so we will enospc out there.  We don't
1156	 * need to start yet another transaction to update the inode as we will
1157	 * update the inode when we finish writing whatever data we write.
1158	 */
1159	update_time_for_write(inode);
1160
1161	start_pos = round_down(pos, fs_info->sectorsize);
1162	oldsize = i_size_read(inode);
1163	if (start_pos > oldsize) {
1164		/* Expand hole size to cover write data, preventing empty gap */
1165		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1166
1167		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1168		if (ret)
1169			return ret;
1170	}
1171
1172	return 0;
1173}
1174
1175static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1176					       struct iov_iter *i)
1177{
1178	struct file *file = iocb->ki_filp;
1179	loff_t pos;
1180	struct inode *inode = file_inode(file);
1181	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1182	struct page **pages = NULL;
1183	struct extent_changeset *data_reserved = NULL;
1184	u64 release_bytes = 0;
1185	u64 lockstart;
1186	u64 lockend;
1187	size_t num_written = 0;
1188	int nrptrs;
1189	ssize_t ret;
1190	bool only_release_metadata = false;
1191	bool force_page_uptodate = false;
1192	loff_t old_isize = i_size_read(inode);
1193	unsigned int ilock_flags = 0;
1194	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1195	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1196
1197	if (nowait)
1198		ilock_flags |= BTRFS_ILOCK_TRY;
1199
1200	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1201	if (ret < 0)
1202		return ret;
1203
1204	ret = generic_write_checks(iocb, i);
1205	if (ret <= 0)
1206		goto out;
1207
1208	ret = btrfs_write_check(iocb, i, ret);
1209	if (ret < 0)
1210		goto out;
1211
1212	pos = iocb->ki_pos;
1213	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1214			PAGE_SIZE / (sizeof(struct page *)));
1215	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1216	nrptrs = max(nrptrs, 8);
1217	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1218	if (!pages) {
1219		ret = -ENOMEM;
1220		goto out;
1221	}
1222
1223	while (iov_iter_count(i) > 0) {
1224		struct extent_state *cached_state = NULL;
1225		size_t offset = offset_in_page(pos);
1226		size_t sector_offset;
1227		size_t write_bytes = min(iov_iter_count(i),
1228					 nrptrs * (size_t)PAGE_SIZE -
1229					 offset);
1230		size_t num_pages;
1231		size_t reserve_bytes;
1232		size_t dirty_pages;
1233		size_t copied;
1234		size_t dirty_sectors;
1235		size_t num_sectors;
1236		int extents_locked;
1237
1238		/*
1239		 * Fault pages before locking them in prepare_pages
1240		 * to avoid recursive lock
1241		 */
1242		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1243			ret = -EFAULT;
1244			break;
1245		}
1246
1247		only_release_metadata = false;
1248		sector_offset = pos & (fs_info->sectorsize - 1);
1249
1250		extent_changeset_release(data_reserved);
1251		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1252						  &data_reserved, pos,
1253						  write_bytes, nowait);
1254		if (ret < 0) {
1255			int can_nocow;
1256
1257			if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1258				ret = -EAGAIN;
1259				break;
1260			}
1261
1262			/*
1263			 * If we don't have to COW at the offset, reserve
1264			 * metadata only. write_bytes may get smaller than
1265			 * requested here.
1266			 */
1267			can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1268							   &write_bytes, nowait);
1269			if (can_nocow < 0)
1270				ret = can_nocow;
1271			if (can_nocow > 0)
1272				ret = 0;
1273			if (ret)
1274				break;
1275			only_release_metadata = true;
1276		}
1277
1278		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1279		WARN_ON(num_pages > nrptrs);
1280		reserve_bytes = round_up(write_bytes + sector_offset,
1281					 fs_info->sectorsize);
1282		WARN_ON(reserve_bytes == 0);
1283		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1284						      reserve_bytes,
1285						      reserve_bytes, nowait);
1286		if (ret) {
1287			if (!only_release_metadata)
1288				btrfs_free_reserved_data_space(BTRFS_I(inode),
1289						data_reserved, pos,
1290						write_bytes);
1291			else
1292				btrfs_check_nocow_unlock(BTRFS_I(inode));
1293
1294			if (nowait && ret == -ENOSPC)
1295				ret = -EAGAIN;
1296			break;
1297		}
1298
1299		release_bytes = reserve_bytes;
1300again:
1301		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1302		if (ret) {
1303			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1304			break;
1305		}
1306
1307		/*
1308		 * This is going to setup the pages array with the number of
1309		 * pages we want, so we don't really need to worry about the
1310		 * contents of pages from loop to loop
1311		 */
1312		ret = prepare_pages(inode, pages, num_pages,
1313				    pos, write_bytes, force_page_uptodate, false);
1314		if (ret) {
1315			btrfs_delalloc_release_extents(BTRFS_I(inode),
1316						       reserve_bytes);
1317			break;
1318		}
1319
1320		extents_locked = lock_and_cleanup_extent_if_need(
1321				BTRFS_I(inode), pages,
1322				num_pages, pos, write_bytes, &lockstart,
1323				&lockend, nowait, &cached_state);
1324		if (extents_locked < 0) {
1325			if (!nowait && extents_locked == -EAGAIN)
1326				goto again;
1327
1328			btrfs_delalloc_release_extents(BTRFS_I(inode),
1329						       reserve_bytes);
1330			ret = extents_locked;
1331			break;
1332		}
1333
1334		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1335
1336		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1337		dirty_sectors = round_up(copied + sector_offset,
1338					fs_info->sectorsize);
1339		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1340
1341		/*
1342		 * if we have trouble faulting in the pages, fall
1343		 * back to one page at a time
1344		 */
1345		if (copied < write_bytes)
1346			nrptrs = 1;
1347
1348		if (copied == 0) {
1349			force_page_uptodate = true;
1350			dirty_sectors = 0;
1351			dirty_pages = 0;
1352		} else {
1353			force_page_uptodate = false;
1354			dirty_pages = DIV_ROUND_UP(copied + offset,
1355						   PAGE_SIZE);
1356		}
1357
1358		if (num_sectors > dirty_sectors) {
1359			/* release everything except the sectors we dirtied */
1360			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1361			if (only_release_metadata) {
1362				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1363							release_bytes, true);
1364			} else {
1365				u64 __pos;
1366
1367				__pos = round_down(pos,
1368						   fs_info->sectorsize) +
1369					(dirty_pages << PAGE_SHIFT);
1370				btrfs_delalloc_release_space(BTRFS_I(inode),
1371						data_reserved, __pos,
1372						release_bytes, true);
1373			}
1374		}
1375
1376		release_bytes = round_up(copied + sector_offset,
1377					fs_info->sectorsize);
1378
1379		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1380					dirty_pages, pos, copied,
1381					&cached_state, only_release_metadata);
1382
1383		/*
1384		 * If we have not locked the extent range, because the range's
1385		 * start offset is >= i_size, we might still have a non-NULL
1386		 * cached extent state, acquired while marking the extent range
1387		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1388		 * possible cached extent state to avoid a memory leak.
1389		 */
1390		if (extents_locked)
1391			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1392				      lockend, &cached_state);
1393		else
1394			free_extent_state(cached_state);
1395
1396		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1397		if (ret) {
1398			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1399			break;
1400		}
1401
1402		release_bytes = 0;
1403		if (only_release_metadata)
1404			btrfs_check_nocow_unlock(BTRFS_I(inode));
1405
1406		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1407
1408		cond_resched();
1409
1410		pos += copied;
1411		num_written += copied;
1412	}
1413
1414	kfree(pages);
1415
1416	if (release_bytes) {
1417		if (only_release_metadata) {
1418			btrfs_check_nocow_unlock(BTRFS_I(inode));
1419			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1420					release_bytes, true);
1421		} else {
1422			btrfs_delalloc_release_space(BTRFS_I(inode),
1423					data_reserved,
1424					round_down(pos, fs_info->sectorsize),
1425					release_bytes, true);
1426		}
1427	}
1428
1429	extent_changeset_free(data_reserved);
1430	if (num_written > 0) {
1431		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1432		iocb->ki_pos += num_written;
1433	}
1434out:
1435	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1436	return num_written ? num_written : ret;
1437}
1438
1439static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1440			       const struct iov_iter *iter, loff_t offset)
1441{
1442	const u32 blocksize_mask = fs_info->sectorsize - 1;
1443
1444	if (offset & blocksize_mask)
1445		return -EINVAL;
1446
1447	if (iov_iter_alignment(iter) & blocksize_mask)
1448		return -EINVAL;
1449
1450	return 0;
1451}
1452
1453static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1454{
1455	struct file *file = iocb->ki_filp;
1456	struct inode *inode = file_inode(file);
1457	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1458	loff_t pos;
1459	ssize_t written = 0;
1460	ssize_t written_buffered;
1461	size_t prev_left = 0;
1462	loff_t endbyte;
1463	ssize_t err;
1464	unsigned int ilock_flags = 0;
1465	struct iomap_dio *dio;
1466
1467	if (iocb->ki_flags & IOCB_NOWAIT)
1468		ilock_flags |= BTRFS_ILOCK_TRY;
1469
1470	/*
1471	 * If the write DIO is within EOF, use a shared lock and also only if
1472	 * security bits will likely not be dropped by file_remove_privs() called
1473	 * from btrfs_write_check(). Either will need to be rechecked after the
1474	 * lock was acquired.
1475	 */
1476	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
1477		ilock_flags |= BTRFS_ILOCK_SHARED;
1478
1479relock:
1480	err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1481	if (err < 0)
1482		return err;
1483
1484	/* Shared lock cannot be used with security bits set. */
1485	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1486		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1487		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1488		goto relock;
1489	}
1490
1491	err = generic_write_checks(iocb, from);
1492	if (err <= 0) {
1493		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1494		return err;
1495	}
1496
1497	err = btrfs_write_check(iocb, from, err);
1498	if (err < 0) {
1499		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1500		goto out;
1501	}
1502
1503	pos = iocb->ki_pos;
1504	/*
1505	 * Re-check since file size may have changed just before taking the
1506	 * lock or pos may have changed because of O_APPEND in generic_write_check()
1507	 */
1508	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1509	    pos + iov_iter_count(from) > i_size_read(inode)) {
1510		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1511		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1512		goto relock;
1513	}
1514
1515	if (check_direct_IO(fs_info, from, pos)) {
1516		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1517		goto buffered;
1518	}
1519
1520	/*
1521	 * The iov_iter can be mapped to the same file range we are writing to.
1522	 * If that's the case, then we will deadlock in the iomap code, because
1523	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
1524	 * an ordered extent, and after that it will fault in the pages that the
1525	 * iov_iter refers to. During the fault in we end up in the readahead
1526	 * pages code (starting at btrfs_readahead()), which will lock the range,
1527	 * find that ordered extent and then wait for it to complete (at
1528	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1529	 * obviously the ordered extent can never complete as we didn't submit
1530	 * yet the respective bio(s). This always happens when the buffer is
1531	 * memory mapped to the same file range, since the iomap DIO code always
1532	 * invalidates pages in the target file range (after starting and waiting
1533	 * for any writeback).
1534	 *
1535	 * So here we disable page faults in the iov_iter and then retry if we
1536	 * got -EFAULT, faulting in the pages before the retry.
1537	 */
1538	from->nofault = true;
1539	dio = btrfs_dio_write(iocb, from, written);
1540	from->nofault = false;
1541
1542	/*
1543	 * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1544	 * iocb, and that needs to lock the inode. So unlock it before calling
1545	 * iomap_dio_complete() to avoid a deadlock.
1546	 */
1547	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1548
1549	if (IS_ERR_OR_NULL(dio))
1550		err = PTR_ERR_OR_ZERO(dio);
1551	else
1552		err = iomap_dio_complete(dio);
1553
1554	/* No increment (+=) because iomap returns a cumulative value. */
1555	if (err > 0)
1556		written = err;
1557
1558	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
1559		const size_t left = iov_iter_count(from);
1560		/*
1561		 * We have more data left to write. Try to fault in as many as
1562		 * possible of the remainder pages and retry. We do this without
1563		 * releasing and locking again the inode, to prevent races with
1564		 * truncate.
1565		 *
1566		 * Also, in case the iov refers to pages in the file range of the
1567		 * file we want to write to (due to a mmap), we could enter an
1568		 * infinite loop if we retry after faulting the pages in, since
1569		 * iomap will invalidate any pages in the range early on, before
1570		 * it tries to fault in the pages of the iov. So we keep track of
1571		 * how much was left of iov in the previous EFAULT and fallback
1572		 * to buffered IO in case we haven't made any progress.
1573		 */
1574		if (left == prev_left) {
1575			err = -ENOTBLK;
1576		} else {
1577			fault_in_iov_iter_readable(from, left);
1578			prev_left = left;
1579			goto relock;
1580		}
1581	}
1582
1583	/*
1584	 * If 'err' is -ENOTBLK or we have not written all data, then it means
1585	 * we must fallback to buffered IO.
1586	 */
1587	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
1588		goto out;
1589
1590buffered:
1591	/*
1592	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1593	 * it must retry the operation in a context where blocking is acceptable,
1594	 * because even if we end up not blocking during the buffered IO attempt
1595	 * below, we will block when flushing and waiting for the IO.
1596	 */
1597	if (iocb->ki_flags & IOCB_NOWAIT) {
1598		err = -EAGAIN;
1599		goto out;
1600	}
1601
1602	pos = iocb->ki_pos;
1603	written_buffered = btrfs_buffered_write(iocb, from);
1604	if (written_buffered < 0) {
1605		err = written_buffered;
1606		goto out;
1607	}
1608	/*
1609	 * Ensure all data is persisted. We want the next direct IO read to be
1610	 * able to read what was just written.
1611	 */
1612	endbyte = pos + written_buffered - 1;
1613	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1614	if (err)
1615		goto out;
1616	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1617	if (err)
1618		goto out;
1619	written += written_buffered;
1620	iocb->ki_pos = pos + written_buffered;
1621	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1622				 endbyte >> PAGE_SHIFT);
1623out:
1624	return err < 0 ? err : written;
1625}
1626
1627static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1628			const struct btrfs_ioctl_encoded_io_args *encoded)
1629{
1630	struct file *file = iocb->ki_filp;
1631	struct inode *inode = file_inode(file);
1632	loff_t count;
1633	ssize_t ret;
1634
1635	btrfs_inode_lock(BTRFS_I(inode), 0);
1636	count = encoded->len;
1637	ret = generic_write_checks_count(iocb, &count);
1638	if (ret == 0 && count != encoded->len) {
1639		/*
1640		 * The write got truncated by generic_write_checks_count(). We
1641		 * can't do a partial encoded write.
1642		 */
1643		ret = -EFBIG;
1644	}
1645	if (ret || encoded->len == 0)
1646		goto out;
1647
1648	ret = btrfs_write_check(iocb, from, encoded->len);
1649	if (ret < 0)
1650		goto out;
1651
1652	ret = btrfs_do_encoded_write(iocb, from, encoded);
1653out:
1654	btrfs_inode_unlock(BTRFS_I(inode), 0);
1655	return ret;
1656}
1657
1658ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1659			    const struct btrfs_ioctl_encoded_io_args *encoded)
1660{
1661	struct file *file = iocb->ki_filp;
1662	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1663	ssize_t num_written, num_sync;
1664
1665	/*
1666	 * If the fs flips readonly due to some impossible error, although we
1667	 * have opened a file as writable, we have to stop this write operation
1668	 * to ensure consistency.
1669	 */
1670	if (BTRFS_FS_ERROR(inode->root->fs_info))
1671		return -EROFS;
1672
1673	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1674		return -EOPNOTSUPP;
1675
1676	if (encoded) {
1677		num_written = btrfs_encoded_write(iocb, from, encoded);
1678		num_sync = encoded->len;
1679	} else if (iocb->ki_flags & IOCB_DIRECT) {
1680		num_written = btrfs_direct_write(iocb, from);
1681		num_sync = num_written;
1682	} else {
1683		num_written = btrfs_buffered_write(iocb, from);
1684		num_sync = num_written;
1685	}
1686
1687	btrfs_set_inode_last_sub_trans(inode);
1688
1689	if (num_sync > 0) {
1690		num_sync = generic_write_sync(iocb, num_sync);
1691		if (num_sync < 0)
1692			num_written = num_sync;
1693	}
1694
1695	return num_written;
1696}
1697
1698static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1699{
1700	return btrfs_do_write_iter(iocb, from, NULL);
1701}
1702
1703int btrfs_release_file(struct inode *inode, struct file *filp)
1704{
1705	struct btrfs_file_private *private = filp->private_data;
1706
1707	if (private) {
1708		kfree(private->filldir_buf);
1709		free_extent_state(private->llseek_cached_state);
1710		kfree(private);
1711		filp->private_data = NULL;
1712	}
1713
1714	/*
1715	 * Set by setattr when we are about to truncate a file from a non-zero
1716	 * size to a zero size.  This tries to flush down new bytes that may
1717	 * have been written if the application were using truncate to replace
1718	 * a file in place.
1719	 */
1720	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1721			       &BTRFS_I(inode)->runtime_flags))
1722			filemap_flush(inode->i_mapping);
1723	return 0;
1724}
1725
1726static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1727{
1728	int ret;
1729	struct blk_plug plug;
1730
1731	/*
1732	 * This is only called in fsync, which would do synchronous writes, so
1733	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1734	 * multiple disks using raid profile, a large IO can be split to
1735	 * several segments of stripe length (currently 64K).
1736	 */
1737	blk_start_plug(&plug);
1738	ret = btrfs_fdatawrite_range(inode, start, end);
1739	blk_finish_plug(&plug);
1740
1741	return ret;
1742}
1743
1744static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1745{
1746	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
1747	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1748
1749	if (btrfs_inode_in_log(inode, fs_info->generation) &&
1750	    list_empty(&ctx->ordered_extents))
1751		return true;
1752
1753	/*
1754	 * If we are doing a fast fsync we can not bail out if the inode's
1755	 * last_trans is <= then the last committed transaction, because we only
1756	 * update the last_trans of the inode during ordered extent completion,
1757	 * and for a fast fsync we don't wait for that, we only wait for the
1758	 * writeback to complete.
1759	 */
1760	if (inode->last_trans <= fs_info->last_trans_committed &&
1761	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1762	     list_empty(&ctx->ordered_extents)))
1763		return true;
1764
1765	return false;
1766}
1767
1768/*
1769 * fsync call for both files and directories.  This logs the inode into
1770 * the tree log instead of forcing full commits whenever possible.
1771 *
1772 * It needs to call filemap_fdatawait so that all ordered extent updates are
1773 * in the metadata btree are up to date for copying to the log.
1774 *
1775 * It drops the inode mutex before doing the tree log commit.  This is an
1776 * important optimization for directories because holding the mutex prevents
1777 * new operations on the dir while we write to disk.
1778 */
1779int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1780{
1781	struct dentry *dentry = file_dentry(file);
1782	struct inode *inode = d_inode(dentry);
1783	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1784	struct btrfs_root *root = BTRFS_I(inode)->root;
1785	struct btrfs_trans_handle *trans;
1786	struct btrfs_log_ctx ctx;
1787	int ret = 0, err;
1788	u64 len;
1789	bool full_sync;
1790
1791	trace_btrfs_sync_file(file, datasync);
1792
1793	btrfs_init_log_ctx(&ctx, inode);
1794
1795	/*
1796	 * Always set the range to a full range, otherwise we can get into
1797	 * several problems, from missing file extent items to represent holes
1798	 * when not using the NO_HOLES feature, to log tree corruption due to
1799	 * races between hole detection during logging and completion of ordered
1800	 * extents outside the range, to missing checksums due to ordered extents
1801	 * for which we flushed only a subset of their pages.
1802	 */
1803	start = 0;
1804	end = LLONG_MAX;
1805	len = (u64)LLONG_MAX + 1;
1806
1807	/*
1808	 * We write the dirty pages in the range and wait until they complete
1809	 * out of the ->i_mutex. If so, we can flush the dirty pages by
1810	 * multi-task, and make the performance up.  See
1811	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1812	 */
1813	ret = start_ordered_ops(inode, start, end);
1814	if (ret)
1815		goto out;
1816
1817	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1818
1819	atomic_inc(&root->log_batch);
1820
1821	/*
1822	 * Before we acquired the inode's lock and the mmap lock, someone may
1823	 * have dirtied more pages in the target range. We need to make sure
1824	 * that writeback for any such pages does not start while we are logging
1825	 * the inode, because if it does, any of the following might happen when
1826	 * we are not doing a full inode sync:
1827	 *
1828	 * 1) We log an extent after its writeback finishes but before its
1829	 *    checksums are added to the csum tree, leading to -EIO errors
1830	 *    when attempting to read the extent after a log replay.
1831	 *
1832	 * 2) We can end up logging an extent before its writeback finishes.
1833	 *    Therefore after the log replay we will have a file extent item
1834	 *    pointing to an unwritten extent (and no data checksums as well).
1835	 *
1836	 * So trigger writeback for any eventual new dirty pages and then we
1837	 * wait for all ordered extents to complete below.
1838	 */
1839	ret = start_ordered_ops(inode, start, end);
1840	if (ret) {
1841		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1842		goto out;
1843	}
1844
1845	/*
1846	 * Always check for the full sync flag while holding the inode's lock,
1847	 * to avoid races with other tasks. The flag must be either set all the
1848	 * time during logging or always off all the time while logging.
1849	 * We check the flag here after starting delalloc above, because when
1850	 * running delalloc the full sync flag may be set if we need to drop
1851	 * extra extent map ranges due to temporary memory allocation failures.
1852	 */
1853	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1854			     &BTRFS_I(inode)->runtime_flags);
1855
1856	/*
1857	 * We have to do this here to avoid the priority inversion of waiting on
1858	 * IO of a lower priority task while holding a transaction open.
1859	 *
1860	 * For a full fsync we wait for the ordered extents to complete while
1861	 * for a fast fsync we wait just for writeback to complete, and then
1862	 * attach the ordered extents to the transaction so that a transaction
1863	 * commit waits for their completion, to avoid data loss if we fsync,
1864	 * the current transaction commits before the ordered extents complete
1865	 * and a power failure happens right after that.
1866	 *
1867	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1868	 * logical address recorded in the ordered extent may change. We need
1869	 * to wait for the IO to stabilize the logical address.
1870	 */
1871	if (full_sync || btrfs_is_zoned(fs_info)) {
1872		ret = btrfs_wait_ordered_range(inode, start, len);
1873	} else {
1874		/*
1875		 * Get our ordered extents as soon as possible to avoid doing
1876		 * checksum lookups in the csum tree, and use instead the
1877		 * checksums attached to the ordered extents.
1878		 */
1879		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
1880						      &ctx.ordered_extents);
1881		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
1882	}
1883
1884	if (ret)
1885		goto out_release_extents;
1886
1887	atomic_inc(&root->log_batch);
1888
1889	smp_mb();
1890	if (skip_inode_logging(&ctx)) {
1891		/*
1892		 * We've had everything committed since the last time we were
1893		 * modified so clear this flag in case it was set for whatever
1894		 * reason, it's no longer relevant.
1895		 */
1896		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1897			  &BTRFS_I(inode)->runtime_flags);
1898		/*
1899		 * An ordered extent might have started before and completed
1900		 * already with io errors, in which case the inode was not
1901		 * updated and we end up here. So check the inode's mapping
1902		 * for any errors that might have happened since we last
1903		 * checked called fsync.
1904		 */
1905		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
1906		goto out_release_extents;
1907	}
1908
1909	/*
1910	 * We use start here because we will need to wait on the IO to complete
1911	 * in btrfs_sync_log, which could require joining a transaction (for
1912	 * example checking cross references in the nocow path).  If we use join
1913	 * here we could get into a situation where we're waiting on IO to
1914	 * happen that is blocked on a transaction trying to commit.  With start
1915	 * we inc the extwriter counter, so we wait for all extwriters to exit
1916	 * before we start blocking joiners.  This comment is to keep somebody
1917	 * from thinking they are super smart and changing this to
1918	 * btrfs_join_transaction *cough*Josef*cough*.
1919	 */
1920	trans = btrfs_start_transaction(root, 0);
1921	if (IS_ERR(trans)) {
1922		ret = PTR_ERR(trans);
1923		goto out_release_extents;
1924	}
1925	trans->in_fsync = true;
1926
1927	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1928	btrfs_release_log_ctx_extents(&ctx);
1929	if (ret < 0) {
1930		/* Fallthrough and commit/free transaction. */
1931		ret = BTRFS_LOG_FORCE_COMMIT;
1932	}
1933
1934	/* we've logged all the items and now have a consistent
1935	 * version of the file in the log.  It is possible that
1936	 * someone will come in and modify the file, but that's
1937	 * fine because the log is consistent on disk, and we
1938	 * have references to all of the file's extents
1939	 *
1940	 * It is possible that someone will come in and log the
1941	 * file again, but that will end up using the synchronization
1942	 * inside btrfs_sync_log to keep things safe.
1943	 */
1944	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1945
1946	if (ret == BTRFS_NO_LOG_SYNC) {
1947		ret = btrfs_end_transaction(trans);
1948		goto out;
1949	}
1950
1951	/* We successfully logged the inode, attempt to sync the log. */
1952	if (!ret) {
1953		ret = btrfs_sync_log(trans, root, &ctx);
1954		if (!ret) {
1955			ret = btrfs_end_transaction(trans);
1956			goto out;
1957		}
1958	}
1959
1960	/*
1961	 * At this point we need to commit the transaction because we had
1962	 * btrfs_need_log_full_commit() or some other error.
1963	 *
1964	 * If we didn't do a full sync we have to stop the trans handle, wait on
1965	 * the ordered extents, start it again and commit the transaction.  If
1966	 * we attempt to wait on the ordered extents here we could deadlock with
1967	 * something like fallocate() that is holding the extent lock trying to
1968	 * start a transaction while some other thread is trying to commit the
1969	 * transaction while we (fsync) are currently holding the transaction
1970	 * open.
1971	 */
1972	if (!full_sync) {
1973		ret = btrfs_end_transaction(trans);
1974		if (ret)
1975			goto out;
1976		ret = btrfs_wait_ordered_range(inode, start, len);
1977		if (ret)
1978			goto out;
1979
1980		/*
1981		 * This is safe to use here because we're only interested in
1982		 * making sure the transaction that had the ordered extents is
1983		 * committed.  We aren't waiting on anything past this point,
1984		 * we're purely getting the transaction and committing it.
1985		 */
1986		trans = btrfs_attach_transaction_barrier(root);
1987		if (IS_ERR(trans)) {
1988			ret = PTR_ERR(trans);
1989
1990			/*
1991			 * We committed the transaction and there's no currently
1992			 * running transaction, this means everything we care
1993			 * about made it to disk and we are done.
1994			 */
1995			if (ret == -ENOENT)
1996				ret = 0;
1997			goto out;
1998		}
1999	}
2000
2001	ret = btrfs_commit_transaction(trans);
2002out:
2003	ASSERT(list_empty(&ctx.list));
2004	ASSERT(list_empty(&ctx.conflict_inodes));
2005	err = file_check_and_advance_wb_err(file);
2006	if (!ret)
2007		ret = err;
2008	return ret > 0 ? -EIO : ret;
2009
2010out_release_extents:
2011	btrfs_release_log_ctx_extents(&ctx);
2012	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2013	goto out;
2014}
2015
2016static const struct vm_operations_struct btrfs_file_vm_ops = {
2017	.fault		= filemap_fault,
2018	.map_pages	= filemap_map_pages,
2019	.page_mkwrite	= btrfs_page_mkwrite,
2020};
2021
2022static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
2023{
2024	struct address_space *mapping = filp->f_mapping;
2025
2026	if (!mapping->a_ops->read_folio)
2027		return -ENOEXEC;
2028
2029	file_accessed(filp);
2030	vma->vm_ops = &btrfs_file_vm_ops;
2031
2032	return 0;
2033}
2034
2035static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2036			  int slot, u64 start, u64 end)
2037{
2038	struct btrfs_file_extent_item *fi;
2039	struct btrfs_key key;
2040
2041	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2042		return 0;
2043
2044	btrfs_item_key_to_cpu(leaf, &key, slot);
2045	if (key.objectid != btrfs_ino(inode) ||
2046	    key.type != BTRFS_EXTENT_DATA_KEY)
2047		return 0;
2048
2049	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2050
2051	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2052		return 0;
2053
2054	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2055		return 0;
2056
2057	if (key.offset == end)
2058		return 1;
2059	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2060		return 1;
2061	return 0;
2062}
2063
2064static int fill_holes(struct btrfs_trans_handle *trans,
2065		struct btrfs_inode *inode,
2066		struct btrfs_path *path, u64 offset, u64 end)
2067{
2068	struct btrfs_fs_info *fs_info = trans->fs_info;
2069	struct btrfs_root *root = inode->root;
2070	struct extent_buffer *leaf;
2071	struct btrfs_file_extent_item *fi;
2072	struct extent_map *hole_em;
2073	struct btrfs_key key;
2074	int ret;
2075
2076	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2077		goto out;
2078
2079	key.objectid = btrfs_ino(inode);
2080	key.type = BTRFS_EXTENT_DATA_KEY;
2081	key.offset = offset;
2082
2083	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2084	if (ret <= 0) {
2085		/*
2086		 * We should have dropped this offset, so if we find it then
2087		 * something has gone horribly wrong.
2088		 */
2089		if (ret == 0)
2090			ret = -EINVAL;
2091		return ret;
2092	}
2093
2094	leaf = path->nodes[0];
2095	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2096		u64 num_bytes;
2097
2098		path->slots[0]--;
2099		fi = btrfs_item_ptr(leaf, path->slots[0],
2100				    struct btrfs_file_extent_item);
2101		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2102			end - offset;
2103		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2104		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2105		btrfs_set_file_extent_offset(leaf, fi, 0);
2106		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2107		btrfs_mark_buffer_dirty(trans, leaf);
2108		goto out;
2109	}
2110
2111	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2112		u64 num_bytes;
2113
2114		key.offset = offset;
2115		btrfs_set_item_key_safe(trans, path, &key);
2116		fi = btrfs_item_ptr(leaf, path->slots[0],
2117				    struct btrfs_file_extent_item);
2118		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2119			offset;
2120		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2121		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2122		btrfs_set_file_extent_offset(leaf, fi, 0);
2123		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2124		btrfs_mark_buffer_dirty(trans, leaf);
2125		goto out;
2126	}
2127	btrfs_release_path(path);
2128
2129	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2130				       end - offset);
2131	if (ret)
2132		return ret;
2133
2134out:
2135	btrfs_release_path(path);
2136
2137	hole_em = alloc_extent_map();
2138	if (!hole_em) {
2139		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2140		btrfs_set_inode_full_sync(inode);
2141	} else {
2142		hole_em->start = offset;
2143		hole_em->len = end - offset;
2144		hole_em->ram_bytes = hole_em->len;
2145		hole_em->orig_start = offset;
2146
2147		hole_em->block_start = EXTENT_MAP_HOLE;
2148		hole_em->block_len = 0;
2149		hole_em->orig_block_len = 0;
2150		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2151		hole_em->generation = trans->transid;
2152
2153		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2154		free_extent_map(hole_em);
2155		if (ret)
2156			btrfs_set_inode_full_sync(inode);
2157	}
2158
2159	return 0;
2160}
2161
2162/*
2163 * Find a hole extent on given inode and change start/len to the end of hole
2164 * extent.(hole/vacuum extent whose em->start <= start &&
2165 *	   em->start + em->len > start)
2166 * When a hole extent is found, return 1 and modify start/len.
2167 */
2168static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2169{
2170	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2171	struct extent_map *em;
2172	int ret = 0;
2173
2174	em = btrfs_get_extent(inode, NULL, 0,
2175			      round_down(*start, fs_info->sectorsize),
2176			      round_up(*len, fs_info->sectorsize));
2177	if (IS_ERR(em))
2178		return PTR_ERR(em);
2179
2180	/* Hole or vacuum extent(only exists in no-hole mode) */
2181	if (em->block_start == EXTENT_MAP_HOLE) {
2182		ret = 1;
2183		*len = em->start + em->len > *start + *len ?
2184		       0 : *start + *len - em->start - em->len;
2185		*start = em->start + em->len;
2186	}
2187	free_extent_map(em);
2188	return ret;
2189}
2190
2191static void btrfs_punch_hole_lock_range(struct inode *inode,
2192					const u64 lockstart,
2193					const u64 lockend,
2194					struct extent_state **cached_state)
2195{
2196	/*
2197	 * For subpage case, if the range is not at page boundary, we could
2198	 * have pages at the leading/tailing part of the range.
2199	 * This could lead to dead loop since filemap_range_has_page()
2200	 * will always return true.
2201	 * So here we need to do extra page alignment for
2202	 * filemap_range_has_page().
2203	 */
2204	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2205	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2206
2207	while (1) {
2208		truncate_pagecache_range(inode, lockstart, lockend);
2209
2210		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2211			    cached_state);
2212		/*
2213		 * We can't have ordered extents in the range, nor dirty/writeback
2214		 * pages, because we have locked the inode's VFS lock in exclusive
2215		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2216		 * we have flushed all delalloc in the range and we have waited
2217		 * for any ordered extents in the range to complete.
2218		 * We can race with anyone reading pages from this range, so after
2219		 * locking the range check if we have pages in the range, and if
2220		 * we do, unlock the range and retry.
2221		 */
2222		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2223					    page_lockend))
2224			break;
2225
2226		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2227			      cached_state);
2228	}
2229
2230	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2231}
2232
2233static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2234				     struct btrfs_inode *inode,
2235				     struct btrfs_path *path,
2236				     struct btrfs_replace_extent_info *extent_info,
2237				     const u64 replace_len,
2238				     const u64 bytes_to_drop)
2239{
2240	struct btrfs_fs_info *fs_info = trans->fs_info;
2241	struct btrfs_root *root = inode->root;
2242	struct btrfs_file_extent_item *extent;
2243	struct extent_buffer *leaf;
2244	struct btrfs_key key;
2245	int slot;
2246	struct btrfs_ref ref = { 0 };
2247	int ret;
2248
2249	if (replace_len == 0)
2250		return 0;
2251
2252	if (extent_info->disk_offset == 0 &&
2253	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2254		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2255		return 0;
2256	}
2257
2258	key.objectid = btrfs_ino(inode);
2259	key.type = BTRFS_EXTENT_DATA_KEY;
2260	key.offset = extent_info->file_offset;
2261	ret = btrfs_insert_empty_item(trans, root, path, &key,
2262				      sizeof(struct btrfs_file_extent_item));
2263	if (ret)
2264		return ret;
2265	leaf = path->nodes[0];
2266	slot = path->slots[0];
2267	write_extent_buffer(leaf, extent_info->extent_buf,
2268			    btrfs_item_ptr_offset(leaf, slot),
2269			    sizeof(struct btrfs_file_extent_item));
2270	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2271	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2272	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2273	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2274	if (extent_info->is_new_extent)
2275		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2276	btrfs_mark_buffer_dirty(trans, leaf);
2277	btrfs_release_path(path);
2278
2279	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2280						replace_len);
2281	if (ret)
2282		return ret;
2283
2284	/* If it's a hole, nothing more needs to be done. */
2285	if (extent_info->disk_offset == 0) {
2286		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2287		return 0;
2288	}
2289
2290	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2291
2292	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2293		key.objectid = extent_info->disk_offset;
2294		key.type = BTRFS_EXTENT_ITEM_KEY;
2295		key.offset = extent_info->disk_len;
2296		ret = btrfs_alloc_reserved_file_extent(trans, root,
2297						       btrfs_ino(inode),
2298						       extent_info->file_offset,
2299						       extent_info->qgroup_reserved,
2300						       &key);
2301	} else {
2302		u64 ref_offset;
2303
2304		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2305				       extent_info->disk_offset,
2306				       extent_info->disk_len, 0);
2307		ref_offset = extent_info->file_offset - extent_info->data_offset;
2308		btrfs_init_data_ref(&ref, root->root_key.objectid,
2309				    btrfs_ino(inode), ref_offset, 0, false);
2310		ret = btrfs_inc_extent_ref(trans, &ref);
2311	}
2312
2313	extent_info->insertions++;
2314
2315	return ret;
2316}
2317
2318/*
2319 * The respective range must have been previously locked, as well as the inode.
2320 * The end offset is inclusive (last byte of the range).
2321 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2322 * the file range with an extent.
2323 * When not punching a hole, we don't want to end up in a state where we dropped
2324 * extents without inserting a new one, so we must abort the transaction to avoid
2325 * a corruption.
2326 */
2327int btrfs_replace_file_extents(struct btrfs_inode *inode,
2328			       struct btrfs_path *path, const u64 start,
2329			       const u64 end,
2330			       struct btrfs_replace_extent_info *extent_info,
2331			       struct btrfs_trans_handle **trans_out)
2332{
2333	struct btrfs_drop_extents_args drop_args = { 0 };
2334	struct btrfs_root *root = inode->root;
2335	struct btrfs_fs_info *fs_info = root->fs_info;
2336	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2337	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2338	struct btrfs_trans_handle *trans = NULL;
2339	struct btrfs_block_rsv *rsv;
2340	unsigned int rsv_count;
2341	u64 cur_offset;
2342	u64 len = end - start;
2343	int ret = 0;
2344
2345	if (end <= start)
2346		return -EINVAL;
2347
2348	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2349	if (!rsv) {
2350		ret = -ENOMEM;
2351		goto out;
2352	}
2353	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2354	rsv->failfast = true;
2355
2356	/*
2357	 * 1 - update the inode
2358	 * 1 - removing the extents in the range
2359	 * 1 - adding the hole extent if no_holes isn't set or if we are
2360	 *     replacing the range with a new extent
2361	 */
2362	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2363		rsv_count = 3;
2364	else
2365		rsv_count = 2;
2366
2367	trans = btrfs_start_transaction(root, rsv_count);
2368	if (IS_ERR(trans)) {
2369		ret = PTR_ERR(trans);
2370		trans = NULL;
2371		goto out_free;
2372	}
2373
2374	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2375				      min_size, false);
2376	if (WARN_ON(ret))
2377		goto out_trans;
2378	trans->block_rsv = rsv;
2379
2380	cur_offset = start;
2381	drop_args.path = path;
2382	drop_args.end = end + 1;
2383	drop_args.drop_cache = true;
2384	while (cur_offset < end) {
2385		drop_args.start = cur_offset;
2386		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2387		/* If we are punching a hole decrement the inode's byte count */
2388		if (!extent_info)
2389			btrfs_update_inode_bytes(inode, 0,
2390						 drop_args.bytes_found);
2391		if (ret != -ENOSPC) {
2392			/*
2393			 * The only time we don't want to abort is if we are
2394			 * attempting to clone a partial inline extent, in which
2395			 * case we'll get EOPNOTSUPP.  However if we aren't
2396			 * clone we need to abort no matter what, because if we
2397			 * got EOPNOTSUPP via prealloc then we messed up and
2398			 * need to abort.
2399			 */
2400			if (ret &&
2401			    (ret != -EOPNOTSUPP ||
2402			     (extent_info && extent_info->is_new_extent)))
2403				btrfs_abort_transaction(trans, ret);
2404			break;
2405		}
2406
2407		trans->block_rsv = &fs_info->trans_block_rsv;
2408
2409		if (!extent_info && cur_offset < drop_args.drop_end &&
2410		    cur_offset < ino_size) {
2411			ret = fill_holes(trans, inode, path, cur_offset,
2412					 drop_args.drop_end);
2413			if (ret) {
2414				/*
2415				 * If we failed then we didn't insert our hole
2416				 * entries for the area we dropped, so now the
2417				 * fs is corrupted, so we must abort the
2418				 * transaction.
2419				 */
2420				btrfs_abort_transaction(trans, ret);
2421				break;
2422			}
2423		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2424			/*
2425			 * We are past the i_size here, but since we didn't
2426			 * insert holes we need to clear the mapped area so we
2427			 * know to not set disk_i_size in this area until a new
2428			 * file extent is inserted here.
2429			 */
2430			ret = btrfs_inode_clear_file_extent_range(inode,
2431					cur_offset,
2432					drop_args.drop_end - cur_offset);
2433			if (ret) {
2434				/*
2435				 * We couldn't clear our area, so we could
2436				 * presumably adjust up and corrupt the fs, so
2437				 * we need to abort.
2438				 */
2439				btrfs_abort_transaction(trans, ret);
2440				break;
2441			}
2442		}
2443
2444		if (extent_info &&
2445		    drop_args.drop_end > extent_info->file_offset) {
2446			u64 replace_len = drop_args.drop_end -
2447					  extent_info->file_offset;
2448
2449			ret = btrfs_insert_replace_extent(trans, inode,	path,
2450					extent_info, replace_len,
2451					drop_args.bytes_found);
2452			if (ret) {
2453				btrfs_abort_transaction(trans, ret);
2454				break;
2455			}
2456			extent_info->data_len -= replace_len;
2457			extent_info->data_offset += replace_len;
2458			extent_info->file_offset += replace_len;
2459		}
2460
2461		/*
2462		 * We are releasing our handle on the transaction, balance the
2463		 * dirty pages of the btree inode and flush delayed items, and
2464		 * then get a new transaction handle, which may now point to a
2465		 * new transaction in case someone else may have committed the
2466		 * transaction we used to replace/drop file extent items. So
2467		 * bump the inode's iversion and update mtime and ctime except
2468		 * if we are called from a dedupe context. This is because a
2469		 * power failure/crash may happen after the transaction is
2470		 * committed and before we finish replacing/dropping all the
2471		 * file extent items we need.
2472		 */
2473		inode_inc_iversion(&inode->vfs_inode);
2474
2475		if (!extent_info || extent_info->update_times)
2476			inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
2477
2478		ret = btrfs_update_inode(trans, root, inode);
2479		if (ret)
2480			break;
2481
2482		btrfs_end_transaction(trans);
2483		btrfs_btree_balance_dirty(fs_info);
2484
2485		trans = btrfs_start_transaction(root, rsv_count);
2486		if (IS_ERR(trans)) {
2487			ret = PTR_ERR(trans);
2488			trans = NULL;
2489			break;
2490		}
2491
2492		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2493					      rsv, min_size, false);
2494		if (WARN_ON(ret))
2495			break;
2496		trans->block_rsv = rsv;
2497
2498		cur_offset = drop_args.drop_end;
2499		len = end - cur_offset;
2500		if (!extent_info && len) {
2501			ret = find_first_non_hole(inode, &cur_offset, &len);
2502			if (unlikely(ret < 0))
2503				break;
2504			if (ret && !len) {
2505				ret = 0;
2506				break;
2507			}
2508		}
2509	}
2510
2511	/*
2512	 * If we were cloning, force the next fsync to be a full one since we
2513	 * we replaced (or just dropped in the case of cloning holes when
2514	 * NO_HOLES is enabled) file extent items and did not setup new extent
2515	 * maps for the replacement extents (or holes).
2516	 */
2517	if (extent_info && !extent_info->is_new_extent)
2518		btrfs_set_inode_full_sync(inode);
2519
2520	if (ret)
2521		goto out_trans;
2522
2523	trans->block_rsv = &fs_info->trans_block_rsv;
2524	/*
2525	 * If we are using the NO_HOLES feature we might have had already an
2526	 * hole that overlaps a part of the region [lockstart, lockend] and
2527	 * ends at (or beyond) lockend. Since we have no file extent items to
2528	 * represent holes, drop_end can be less than lockend and so we must
2529	 * make sure we have an extent map representing the existing hole (the
2530	 * call to __btrfs_drop_extents() might have dropped the existing extent
2531	 * map representing the existing hole), otherwise the fast fsync path
2532	 * will not record the existence of the hole region
2533	 * [existing_hole_start, lockend].
2534	 */
2535	if (drop_args.drop_end <= end)
2536		drop_args.drop_end = end + 1;
2537	/*
2538	 * Don't insert file hole extent item if it's for a range beyond eof
2539	 * (because it's useless) or if it represents a 0 bytes range (when
2540	 * cur_offset == drop_end).
2541	 */
2542	if (!extent_info && cur_offset < ino_size &&
2543	    cur_offset < drop_args.drop_end) {
2544		ret = fill_holes(trans, inode, path, cur_offset,
2545				 drop_args.drop_end);
2546		if (ret) {
2547			/* Same comment as above. */
2548			btrfs_abort_transaction(trans, ret);
2549			goto out_trans;
2550		}
2551	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2552		/* See the comment in the loop above for the reasoning here. */
2553		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2554					drop_args.drop_end - cur_offset);
2555		if (ret) {
2556			btrfs_abort_transaction(trans, ret);
2557			goto out_trans;
2558		}
2559
2560	}
2561	if (extent_info) {
2562		ret = btrfs_insert_replace_extent(trans, inode, path,
2563				extent_info, extent_info->data_len,
2564				drop_args.bytes_found);
2565		if (ret) {
2566			btrfs_abort_transaction(trans, ret);
2567			goto out_trans;
2568		}
2569	}
2570
2571out_trans:
2572	if (!trans)
2573		goto out_free;
2574
2575	trans->block_rsv = &fs_info->trans_block_rsv;
2576	if (ret)
2577		btrfs_end_transaction(trans);
2578	else
2579		*trans_out = trans;
2580out_free:
2581	btrfs_free_block_rsv(fs_info, rsv);
2582out:
2583	return ret;
2584}
2585
2586static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2587{
2588	struct inode *inode = file_inode(file);
2589	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2590	struct btrfs_root *root = BTRFS_I(inode)->root;
2591	struct extent_state *cached_state = NULL;
2592	struct btrfs_path *path;
2593	struct btrfs_trans_handle *trans = NULL;
2594	u64 lockstart;
2595	u64 lockend;
2596	u64 tail_start;
2597	u64 tail_len;
2598	u64 orig_start = offset;
2599	int ret = 0;
2600	bool same_block;
2601	u64 ino_size;
2602	bool truncated_block = false;
2603	bool updated_inode = false;
2604
2605	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2606
2607	ret = btrfs_wait_ordered_range(inode, offset, len);
2608	if (ret)
2609		goto out_only_mutex;
2610
2611	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2612	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2613	if (ret < 0)
2614		goto out_only_mutex;
2615	if (ret && !len) {
2616		/* Already in a large hole */
2617		ret = 0;
2618		goto out_only_mutex;
2619	}
2620
2621	ret = file_modified(file);
2622	if (ret)
2623		goto out_only_mutex;
2624
2625	lockstart = round_up(offset, fs_info->sectorsize);
2626	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2627	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2628		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2629	/*
2630	 * We needn't truncate any block which is beyond the end of the file
2631	 * because we are sure there is no data there.
2632	 */
2633	/*
2634	 * Only do this if we are in the same block and we aren't doing the
2635	 * entire block.
2636	 */
2637	if (same_block && len < fs_info->sectorsize) {
2638		if (offset < ino_size) {
2639			truncated_block = true;
2640			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2641						   0);
2642		} else {
2643			ret = 0;
2644		}
2645		goto out_only_mutex;
2646	}
2647
2648	/* zero back part of the first block */
2649	if (offset < ino_size) {
2650		truncated_block = true;
2651		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2652		if (ret) {
2653			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2654			return ret;
2655		}
2656	}
2657
2658	/* Check the aligned pages after the first unaligned page,
2659	 * if offset != orig_start, which means the first unaligned page
2660	 * including several following pages are already in holes,
2661	 * the extra check can be skipped */
2662	if (offset == orig_start) {
2663		/* after truncate page, check hole again */
2664		len = offset + len - lockstart;
2665		offset = lockstart;
2666		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2667		if (ret < 0)
2668			goto out_only_mutex;
2669		if (ret && !len) {
2670			ret = 0;
2671			goto out_only_mutex;
2672		}
2673		lockstart = offset;
2674	}
2675
2676	/* Check the tail unaligned part is in a hole */
2677	tail_start = lockend + 1;
2678	tail_len = offset + len - tail_start;
2679	if (tail_len) {
2680		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2681		if (unlikely(ret < 0))
2682			goto out_only_mutex;
2683		if (!ret) {
2684			/* zero the front end of the last page */
2685			if (tail_start + tail_len < ino_size) {
2686				truncated_block = true;
2687				ret = btrfs_truncate_block(BTRFS_I(inode),
2688							tail_start + tail_len,
2689							0, 1);
2690				if (ret)
2691					goto out_only_mutex;
2692			}
2693		}
2694	}
2695
2696	if (lockend < lockstart) {
2697		ret = 0;
2698		goto out_only_mutex;
2699	}
2700
2701	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2702
2703	path = btrfs_alloc_path();
2704	if (!path) {
2705		ret = -ENOMEM;
2706		goto out;
2707	}
2708
2709	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2710					 lockend, NULL, &trans);
2711	btrfs_free_path(path);
2712	if (ret)
2713		goto out;
2714
2715	ASSERT(trans != NULL);
2716	inode_inc_iversion(inode);
2717	inode->i_mtime = inode_set_ctime_current(inode);
2718	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2719	updated_inode = true;
2720	btrfs_end_transaction(trans);
2721	btrfs_btree_balance_dirty(fs_info);
2722out:
2723	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2724		      &cached_state);
2725out_only_mutex:
2726	if (!updated_inode && truncated_block && !ret) {
2727		/*
2728		 * If we only end up zeroing part of a page, we still need to
2729		 * update the inode item, so that all the time fields are
2730		 * updated as well as the necessary btrfs inode in memory fields
2731		 * for detecting, at fsync time, if the inode isn't yet in the
2732		 * log tree or it's there but not up to date.
2733		 */
2734		struct timespec64 now = inode_set_ctime_current(inode);
2735
2736		inode_inc_iversion(inode);
2737		inode->i_mtime = now;
2738		trans = btrfs_start_transaction(root, 1);
2739		if (IS_ERR(trans)) {
2740			ret = PTR_ERR(trans);
2741		} else {
2742			int ret2;
2743
2744			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2745			ret2 = btrfs_end_transaction(trans);
2746			if (!ret)
2747				ret = ret2;
2748		}
2749	}
2750	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2751	return ret;
2752}
2753
2754/* Helper structure to record which range is already reserved */
2755struct falloc_range {
2756	struct list_head list;
2757	u64 start;
2758	u64 len;
2759};
2760
2761/*
2762 * Helper function to add falloc range
2763 *
2764 * Caller should have locked the larger range of extent containing
2765 * [start, len)
2766 */
2767static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2768{
2769	struct falloc_range *range = NULL;
2770
2771	if (!list_empty(head)) {
2772		/*
2773		 * As fallocate iterates by bytenr order, we only need to check
2774		 * the last range.
2775		 */
2776		range = list_last_entry(head, struct falloc_range, list);
2777		if (range->start + range->len == start) {
2778			range->len += len;
2779			return 0;
2780		}
2781	}
2782
2783	range = kmalloc(sizeof(*range), GFP_KERNEL);
2784	if (!range)
2785		return -ENOMEM;
2786	range->start = start;
2787	range->len = len;
2788	list_add_tail(&range->list, head);
2789	return 0;
2790}
2791
2792static int btrfs_fallocate_update_isize(struct inode *inode,
2793					const u64 end,
2794					const int mode)
2795{
2796	struct btrfs_trans_handle *trans;
2797	struct btrfs_root *root = BTRFS_I(inode)->root;
2798	int ret;
2799	int ret2;
2800
2801	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2802		return 0;
2803
2804	trans = btrfs_start_transaction(root, 1);
2805	if (IS_ERR(trans))
2806		return PTR_ERR(trans);
2807
2808	inode_set_ctime_current(inode);
2809	i_size_write(inode, end);
2810	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2811	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2812	ret2 = btrfs_end_transaction(trans);
2813
2814	return ret ? ret : ret2;
2815}
2816
2817enum {
2818	RANGE_BOUNDARY_WRITTEN_EXTENT,
2819	RANGE_BOUNDARY_PREALLOC_EXTENT,
2820	RANGE_BOUNDARY_HOLE,
2821};
2822
2823static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2824						 u64 offset)
2825{
2826	const u64 sectorsize = inode->root->fs_info->sectorsize;
2827	struct extent_map *em;
2828	int ret;
2829
2830	offset = round_down(offset, sectorsize);
2831	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
2832	if (IS_ERR(em))
2833		return PTR_ERR(em);
2834
2835	if (em->block_start == EXTENT_MAP_HOLE)
2836		ret = RANGE_BOUNDARY_HOLE;
2837	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2838		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2839	else
2840		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2841
2842	free_extent_map(em);
2843	return ret;
2844}
2845
2846static int btrfs_zero_range(struct inode *inode,
2847			    loff_t offset,
2848			    loff_t len,
2849			    const int mode)
2850{
2851	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2852	struct extent_map *em;
2853	struct extent_changeset *data_reserved = NULL;
2854	int ret;
2855	u64 alloc_hint = 0;
2856	const u64 sectorsize = fs_info->sectorsize;
2857	u64 alloc_start = round_down(offset, sectorsize);
2858	u64 alloc_end = round_up(offset + len, sectorsize);
2859	u64 bytes_to_reserve = 0;
2860	bool space_reserved = false;
2861
2862	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2863			      alloc_end - alloc_start);
2864	if (IS_ERR(em)) {
2865		ret = PTR_ERR(em);
2866		goto out;
2867	}
2868
2869	/*
2870	 * Avoid hole punching and extent allocation for some cases. More cases
2871	 * could be considered, but these are unlikely common and we keep things
2872	 * as simple as possible for now. Also, intentionally, if the target
2873	 * range contains one or more prealloc extents together with regular
2874	 * extents and holes, we drop all the existing extents and allocate a
2875	 * new prealloc extent, so that we get a larger contiguous disk extent.
2876	 */
2877	if (em->start <= alloc_start &&
2878	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2879		const u64 em_end = em->start + em->len;
2880
2881		if (em_end >= offset + len) {
2882			/*
2883			 * The whole range is already a prealloc extent,
2884			 * do nothing except updating the inode's i_size if
2885			 * needed.
2886			 */
2887			free_extent_map(em);
2888			ret = btrfs_fallocate_update_isize(inode, offset + len,
2889							   mode);
2890			goto out;
2891		}
2892		/*
2893		 * Part of the range is already a prealloc extent, so operate
2894		 * only on the remaining part of the range.
2895		 */
2896		alloc_start = em_end;
2897		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2898		len = offset + len - alloc_start;
2899		offset = alloc_start;
2900		alloc_hint = em->block_start + em->len;
2901	}
2902	free_extent_map(em);
2903
2904	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2905	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2906		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2907				      sectorsize);
2908		if (IS_ERR(em)) {
2909			ret = PTR_ERR(em);
2910			goto out;
2911		}
2912
2913		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2914			free_extent_map(em);
2915			ret = btrfs_fallocate_update_isize(inode, offset + len,
2916							   mode);
2917			goto out;
2918		}
2919		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
2920			free_extent_map(em);
2921			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2922						   0);
2923			if (!ret)
2924				ret = btrfs_fallocate_update_isize(inode,
2925								   offset + len,
2926								   mode);
2927			return ret;
2928		}
2929		free_extent_map(em);
2930		alloc_start = round_down(offset, sectorsize);
2931		alloc_end = alloc_start + sectorsize;
2932		goto reserve_space;
2933	}
2934
2935	alloc_start = round_up(offset, sectorsize);
2936	alloc_end = round_down(offset + len, sectorsize);
2937
2938	/*
2939	 * For unaligned ranges, check the pages at the boundaries, they might
2940	 * map to an extent, in which case we need to partially zero them, or
2941	 * they might map to a hole, in which case we need our allocation range
2942	 * to cover them.
2943	 */
2944	if (!IS_ALIGNED(offset, sectorsize)) {
2945		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2946							    offset);
2947		if (ret < 0)
2948			goto out;
2949		if (ret == RANGE_BOUNDARY_HOLE) {
2950			alloc_start = round_down(offset, sectorsize);
2951			ret = 0;
2952		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2953			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2954			if (ret)
2955				goto out;
2956		} else {
2957			ret = 0;
2958		}
2959	}
2960
2961	if (!IS_ALIGNED(offset + len, sectorsize)) {
2962		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2963							    offset + len);
2964		if (ret < 0)
2965			goto out;
2966		if (ret == RANGE_BOUNDARY_HOLE) {
2967			alloc_end = round_up(offset + len, sectorsize);
2968			ret = 0;
2969		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2970			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2971						   0, 1);
2972			if (ret)
2973				goto out;
2974		} else {
2975			ret = 0;
2976		}
2977	}
2978
2979reserve_space:
2980	if (alloc_start < alloc_end) {
2981		struct extent_state *cached_state = NULL;
2982		const u64 lockstart = alloc_start;
2983		const u64 lockend = alloc_end - 1;
2984
2985		bytes_to_reserve = alloc_end - alloc_start;
2986		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2987						      bytes_to_reserve);
2988		if (ret < 0)
2989			goto out;
2990		space_reserved = true;
2991		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2992					    &cached_state);
2993		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2994						alloc_start, bytes_to_reserve);
2995		if (ret) {
2996			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
2997				      lockend, &cached_state);
2998			goto out;
2999		}
3000		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3001						alloc_end - alloc_start,
3002						i_blocksize(inode),
3003						offset + len, &alloc_hint);
3004		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3005			      &cached_state);
3006		/* btrfs_prealloc_file_range releases reserved space on error */
3007		if (ret) {
3008			space_reserved = false;
3009			goto out;
3010		}
3011	}
3012	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3013 out:
3014	if (ret && space_reserved)
3015		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3016					       alloc_start, bytes_to_reserve);
3017	extent_changeset_free(data_reserved);
3018
3019	return ret;
3020}
3021
3022static long btrfs_fallocate(struct file *file, int mode,
3023			    loff_t offset, loff_t len)
3024{
3025	struct inode *inode = file_inode(file);
3026	struct extent_state *cached_state = NULL;
3027	struct extent_changeset *data_reserved = NULL;
3028	struct falloc_range *range;
3029	struct falloc_range *tmp;
3030	LIST_HEAD(reserve_list);
3031	u64 cur_offset;
3032	u64 last_byte;
3033	u64 alloc_start;
3034	u64 alloc_end;
3035	u64 alloc_hint = 0;
3036	u64 locked_end;
3037	u64 actual_end = 0;
3038	u64 data_space_needed = 0;
3039	u64 data_space_reserved = 0;
3040	u64 qgroup_reserved = 0;
3041	struct extent_map *em;
3042	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3043	int ret;
3044
3045	/* Do not allow fallocate in ZONED mode */
3046	if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3047		return -EOPNOTSUPP;
3048
3049	alloc_start = round_down(offset, blocksize);
3050	alloc_end = round_up(offset + len, blocksize);
3051	cur_offset = alloc_start;
3052
3053	/* Make sure we aren't being give some crap mode */
3054	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3055		     FALLOC_FL_ZERO_RANGE))
3056		return -EOPNOTSUPP;
3057
3058	if (mode & FALLOC_FL_PUNCH_HOLE)
3059		return btrfs_punch_hole(file, offset, len);
3060
3061	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3062
3063	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3064		ret = inode_newsize_ok(inode, offset + len);
3065		if (ret)
3066			goto out;
3067	}
3068
3069	ret = file_modified(file);
3070	if (ret)
3071		goto out;
3072
3073	/*
3074	 * TODO: Move these two operations after we have checked
3075	 * accurate reserved space, or fallocate can still fail but
3076	 * with page truncated or size expanded.
3077	 *
3078	 * But that's a minor problem and won't do much harm BTW.
3079	 */
3080	if (alloc_start > inode->i_size) {
3081		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3082					alloc_start);
3083		if (ret)
3084			goto out;
3085	} else if (offset + len > inode->i_size) {
3086		/*
3087		 * If we are fallocating from the end of the file onward we
3088		 * need to zero out the end of the block if i_size lands in the
3089		 * middle of a block.
3090		 */
3091		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3092		if (ret)
3093			goto out;
3094	}
3095
3096	/*
3097	 * We have locked the inode at the VFS level (in exclusive mode) and we
3098	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3099	 * locking the file range, flush all dealloc in the range and wait for
3100	 * all ordered extents in the range to complete. After this we can lock
3101	 * the file range and, due to the previous locking we did, we know there
3102	 * can't be more delalloc or ordered extents in the range.
3103	 */
3104	ret = btrfs_wait_ordered_range(inode, alloc_start,
3105				       alloc_end - alloc_start);
3106	if (ret)
3107		goto out;
3108
3109	if (mode & FALLOC_FL_ZERO_RANGE) {
3110		ret = btrfs_zero_range(inode, offset, len, mode);
3111		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3112		return ret;
3113	}
3114
3115	locked_end = alloc_end - 1;
3116	lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3117		    &cached_state);
3118
3119	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3120
3121	/* First, check if we exceed the qgroup limit */
3122	while (cur_offset < alloc_end) {
3123		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3124				      alloc_end - cur_offset);
3125		if (IS_ERR(em)) {
3126			ret = PTR_ERR(em);
3127			break;
3128		}
3129		last_byte = min(extent_map_end(em), alloc_end);
3130		actual_end = min_t(u64, extent_map_end(em), offset + len);
3131		last_byte = ALIGN(last_byte, blocksize);
3132		if (em->block_start == EXTENT_MAP_HOLE ||
3133		    (cur_offset >= inode->i_size &&
3134		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3135			const u64 range_len = last_byte - cur_offset;
3136
3137			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3138			if (ret < 0) {
3139				free_extent_map(em);
3140				break;
3141			}
3142			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3143					&data_reserved, cur_offset, range_len);
3144			if (ret < 0) {
3145				free_extent_map(em);
3146				break;
3147			}
3148			qgroup_reserved += range_len;
3149			data_space_needed += range_len;
3150		}
3151		free_extent_map(em);
3152		cur_offset = last_byte;
3153	}
3154
3155	if (!ret && data_space_needed > 0) {
3156		/*
3157		 * We are safe to reserve space here as we can't have delalloc
3158		 * in the range, see above.
3159		 */
3160		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3161						      data_space_needed);
3162		if (!ret)
3163			data_space_reserved = data_space_needed;
3164	}
3165
3166	/*
3167	 * If ret is still 0, means we're OK to fallocate.
3168	 * Or just cleanup the list and exit.
3169	 */
3170	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3171		if (!ret) {
3172			ret = btrfs_prealloc_file_range(inode, mode,
3173					range->start,
3174					range->len, i_blocksize(inode),
3175					offset + len, &alloc_hint);
3176			/*
3177			 * btrfs_prealloc_file_range() releases space even
3178			 * if it returns an error.
3179			 */
3180			data_space_reserved -= range->len;
3181			qgroup_reserved -= range->len;
3182		} else if (data_space_reserved > 0) {
3183			btrfs_free_reserved_data_space(BTRFS_I(inode),
3184					       data_reserved, range->start,
3185					       range->len);
3186			data_space_reserved -= range->len;
3187			qgroup_reserved -= range->len;
3188		} else if (qgroup_reserved > 0) {
3189			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3190					       range->start, range->len, NULL);
3191			qgroup_reserved -= range->len;
3192		}
3193		list_del(&range->list);
3194		kfree(range);
3195	}
3196	if (ret < 0)
3197		goto out_unlock;
3198
3199	/*
3200	 * We didn't need to allocate any more space, but we still extended the
3201	 * size of the file so we need to update i_size and the inode item.
3202	 */
3203	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3204out_unlock:
3205	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3206		      &cached_state);
3207out:
3208	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3209	extent_changeset_free(data_reserved);
3210	return ret;
3211}
3212
3213/*
3214 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3215 * that has unflushed and/or flushing delalloc. There might be other adjacent
3216 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3217 * looping while it gets adjacent subranges, and merging them together.
3218 */
3219static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3220				   struct extent_state **cached_state,
3221				   bool *search_io_tree,
3222				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3223{
3224	u64 len = end + 1 - start;
3225	u64 delalloc_len = 0;
3226	struct btrfs_ordered_extent *oe;
3227	u64 oe_start;
3228	u64 oe_end;
3229
3230	/*
3231	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3232	 * means we have delalloc (dirty pages) for which writeback has not
3233	 * started yet.
3234	 */
3235	if (*search_io_tree) {
3236		spin_lock(&inode->lock);
3237		if (inode->delalloc_bytes > 0) {
3238			spin_unlock(&inode->lock);
3239			*delalloc_start_ret = start;
3240			delalloc_len = count_range_bits(&inode->io_tree,
3241							delalloc_start_ret, end,
3242							len, EXTENT_DELALLOC, 1,
3243							cached_state);
3244		} else {
3245			spin_unlock(&inode->lock);
3246		}
3247	}
3248
3249	if (delalloc_len > 0) {
3250		/*
3251		 * If delalloc was found then *delalloc_start_ret has a sector size
3252		 * aligned value (rounded down).
3253		 */
3254		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3255
3256		if (*delalloc_start_ret == start) {
3257			/* Delalloc for the whole range, nothing more to do. */
3258			if (*delalloc_end_ret == end)
3259				return true;
3260			/* Else trim our search range for ordered extents. */
3261			start = *delalloc_end_ret + 1;
3262			len = end + 1 - start;
3263		}
3264	} else {
3265		/* No delalloc, future calls don't need to search again. */
3266		*search_io_tree = false;
3267	}
3268
3269	/*
3270	 * Now also check if there's any ordered extent in the range.
3271	 * We do this because:
3272	 *
3273	 * 1) When delalloc is flushed, the file range is locked, we clear the
3274	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3275	 *    an ordered extent for the write. So we might just have been called
3276	 *    after delalloc is flushed and before the ordered extent completes
3277	 *    and inserts the new file extent item in the subvolume's btree;
3278	 *
3279	 * 2) We may have an ordered extent created by flushing delalloc for a
3280	 *    subrange that starts before the subrange we found marked with
3281	 *    EXTENT_DELALLOC in the io tree.
3282	 *
3283	 * We could also use the extent map tree to find such delalloc that is
3284	 * being flushed, but using the ordered extents tree is more efficient
3285	 * because it's usually much smaller as ordered extents are removed from
3286	 * the tree once they complete. With the extent maps, we mau have them
3287	 * in the extent map tree for a very long time, and they were either
3288	 * created by previous writes or loaded by read operations.
3289	 */
3290	oe = btrfs_lookup_first_ordered_range(inode, start, len);
3291	if (!oe)
3292		return (delalloc_len > 0);
3293
3294	/* The ordered extent may span beyond our search range. */
3295	oe_start = max(oe->file_offset, start);
3296	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3297
3298	btrfs_put_ordered_extent(oe);
3299
3300	/* Don't have unflushed delalloc, return the ordered extent range. */
3301	if (delalloc_len == 0) {
3302		*delalloc_start_ret = oe_start;
3303		*delalloc_end_ret = oe_end;
3304		return true;
3305	}
3306
3307	/*
3308	 * We have both unflushed delalloc (io_tree) and an ordered extent.
3309	 * If the ranges are adjacent returned a combined range, otherwise
3310	 * return the leftmost range.
3311	 */
3312	if (oe_start < *delalloc_start_ret) {
3313		if (oe_end < *delalloc_start_ret)
3314			*delalloc_end_ret = oe_end;
3315		*delalloc_start_ret = oe_start;
3316	} else if (*delalloc_end_ret + 1 == oe_start) {
3317		*delalloc_end_ret = oe_end;
3318	}
3319
3320	return true;
3321}
3322
3323/*
3324 * Check if there's delalloc in a given range.
3325 *
3326 * @inode:               The inode.
3327 * @start:               The start offset of the range. It does not need to be
3328 *                       sector size aligned.
3329 * @end:                 The end offset (inclusive value) of the search range.
3330 *                       It does not need to be sector size aligned.
3331 * @cached_state:        Extent state record used for speeding up delalloc
3332 *                       searches in the inode's io_tree. Can be NULL.
3333 * @delalloc_start_ret:  Output argument, set to the start offset of the
3334 *                       subrange found with delalloc (may not be sector size
3335 *                       aligned).
3336 * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3337 *                       of the subrange found with delalloc.
3338 *
3339 * Returns true if a subrange with delalloc is found within the given range, and
3340 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3341 * end offsets of the subrange.
3342 */
3343bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3344				  struct extent_state **cached_state,
3345				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3346{
3347	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3348	u64 prev_delalloc_end = 0;
3349	bool search_io_tree = true;
3350	bool ret = false;
3351
3352	while (cur_offset <= end) {
3353		u64 delalloc_start;
3354		u64 delalloc_end;
3355		bool delalloc;
3356
3357		delalloc = find_delalloc_subrange(inode, cur_offset, end,
3358						  cached_state, &search_io_tree,
3359						  &delalloc_start,
3360						  &delalloc_end);
3361		if (!delalloc)
3362			break;
3363
3364		if (prev_delalloc_end == 0) {
3365			/* First subrange found. */
3366			*delalloc_start_ret = max(delalloc_start, start);
3367			*delalloc_end_ret = delalloc_end;
3368			ret = true;
3369		} else if (delalloc_start == prev_delalloc_end + 1) {
3370			/* Subrange adjacent to the previous one, merge them. */
3371			*delalloc_end_ret = delalloc_end;
3372		} else {
3373			/* Subrange not adjacent to the previous one, exit. */
3374			break;
3375		}
3376
3377		prev_delalloc_end = delalloc_end;
3378		cur_offset = delalloc_end + 1;
3379		cond_resched();
3380	}
3381
3382	return ret;
3383}
3384
3385/*
3386 * Check if there's a hole or delalloc range in a range representing a hole (or
3387 * prealloc extent) found in the inode's subvolume btree.
3388 *
3389 * @inode:      The inode.
3390 * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3391 * @start:      Start offset of the hole region. It does not need to be sector
3392 *              size aligned.
3393 * @end:        End offset (inclusive value) of the hole region. It does not
3394 *              need to be sector size aligned.
3395 * @start_ret:  Return parameter, used to set the start of the subrange in the
3396 *              hole that matches the search criteria (seek mode), if such
3397 *              subrange is found (return value of the function is true).
3398 *              The value returned here may not be sector size aligned.
3399 *
3400 * Returns true if a subrange matching the given seek mode is found, and if one
3401 * is found, it updates @start_ret with the start of the subrange.
3402 */
3403static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3404					struct extent_state **cached_state,
3405					u64 start, u64 end, u64 *start_ret)
3406{
3407	u64 delalloc_start;
3408	u64 delalloc_end;
3409	bool delalloc;
3410
3411	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3412						&delalloc_start, &delalloc_end);
3413	if (delalloc && whence == SEEK_DATA) {
3414		*start_ret = delalloc_start;
3415		return true;
3416	}
3417
3418	if (delalloc && whence == SEEK_HOLE) {
3419		/*
3420		 * We found delalloc but it starts after out start offset. So we
3421		 * have a hole between our start offset and the delalloc start.
3422		 */
3423		if (start < delalloc_start) {
3424			*start_ret = start;
3425			return true;
3426		}
3427		/*
3428		 * Delalloc range starts at our start offset.
3429		 * If the delalloc range's length is smaller than our range,
3430		 * then it means we have a hole that starts where the delalloc
3431		 * subrange ends.
3432		 */
3433		if (delalloc_end < end) {
3434			*start_ret = delalloc_end + 1;
3435			return true;
3436		}
3437
3438		/* There's delalloc for the whole range. */
3439		return false;
3440	}
3441
3442	if (!delalloc && whence == SEEK_HOLE) {
3443		*start_ret = start;
3444		return true;
3445	}
3446
3447	/*
3448	 * No delalloc in the range and we are seeking for data. The caller has
3449	 * to iterate to the next extent item in the subvolume btree.
3450	 */
3451	return false;
3452}
3453
3454static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3455{
3456	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3457	struct btrfs_file_private *private = file->private_data;
3458	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3459	struct extent_state *cached_state = NULL;
3460	struct extent_state **delalloc_cached_state;
3461	const loff_t i_size = i_size_read(&inode->vfs_inode);
3462	const u64 ino = btrfs_ino(inode);
3463	struct btrfs_root *root = inode->root;
3464	struct btrfs_path *path;
3465	struct btrfs_key key;
3466	u64 last_extent_end;
3467	u64 lockstart;
3468	u64 lockend;
3469	u64 start;
3470	int ret;
3471	bool found = false;
3472
3473	if (i_size == 0 || offset >= i_size)
3474		return -ENXIO;
3475
3476	/*
3477	 * Quick path. If the inode has no prealloc extents and its number of
3478	 * bytes used matches its i_size, then it can not have holes.
3479	 */
3480	if (whence == SEEK_HOLE &&
3481	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
3482	    inode_get_bytes(&inode->vfs_inode) == i_size)
3483		return i_size;
3484
3485	if (!private) {
3486		private = kzalloc(sizeof(*private), GFP_KERNEL);
3487		/*
3488		 * No worries if memory allocation failed.
3489		 * The private structure is used only for speeding up multiple
3490		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3491		 * so everything will still be correct.
3492		 */
3493		file->private_data = private;
3494	}
3495
3496	if (private)
3497		delalloc_cached_state = &private->llseek_cached_state;
3498	else
3499		delalloc_cached_state = NULL;
3500
3501	/*
3502	 * offset can be negative, in this case we start finding DATA/HOLE from
3503	 * the very start of the file.
3504	 */
3505	start = max_t(loff_t, 0, offset);
3506
3507	lockstart = round_down(start, fs_info->sectorsize);
3508	lockend = round_up(i_size, fs_info->sectorsize);
3509	if (lockend <= lockstart)
3510		lockend = lockstart + fs_info->sectorsize;
3511	lockend--;
3512
3513	path = btrfs_alloc_path();
3514	if (!path)
3515		return -ENOMEM;
3516	path->reada = READA_FORWARD;
3517
3518	key.objectid = ino;
3519	key.type = BTRFS_EXTENT_DATA_KEY;
3520	key.offset = start;
3521
3522	last_extent_end = lockstart;
3523
3524	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3525
3526	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3527	if (ret < 0) {
3528		goto out;
3529	} else if (ret > 0 && path->slots[0] > 0) {
3530		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3531		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3532			path->slots[0]--;
3533	}
3534
3535	while (start < i_size) {
3536		struct extent_buffer *leaf = path->nodes[0];
3537		struct btrfs_file_extent_item *extent;
3538		u64 extent_end;
3539		u8 type;
3540
3541		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3542			ret = btrfs_next_leaf(root, path);
3543			if (ret < 0)
3544				goto out;
3545			else if (ret > 0)
3546				break;
3547
3548			leaf = path->nodes[0];
3549		}
3550
3551		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3552		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3553			break;
3554
3555		extent_end = btrfs_file_extent_end(path);
3556
3557		/*
3558		 * In the first iteration we may have a slot that points to an
3559		 * extent that ends before our start offset, so skip it.
3560		 */
3561		if (extent_end <= start) {
3562			path->slots[0]++;
3563			continue;
3564		}
3565
3566		/* We have an implicit hole, NO_HOLES feature is likely set. */
3567		if (last_extent_end < key.offset) {
3568			u64 search_start = last_extent_end;
3569			u64 found_start;
3570
3571			/*
3572			 * First iteration, @start matches @offset and it's
3573			 * within the hole.
3574			 */
3575			if (start == offset)
3576				search_start = offset;
3577
3578			found = find_desired_extent_in_hole(inode, whence,
3579							    delalloc_cached_state,
3580							    search_start,
3581							    key.offset - 1,
3582							    &found_start);
3583			if (found) {
3584				start = found_start;
3585				break;
3586			}
3587			/*
3588			 * Didn't find data or a hole (due to delalloc) in the
3589			 * implicit hole range, so need to analyze the extent.
3590			 */
3591		}
3592
3593		extent = btrfs_item_ptr(leaf, path->slots[0],
3594					struct btrfs_file_extent_item);
3595		type = btrfs_file_extent_type(leaf, extent);
3596
3597		/*
3598		 * Can't access the extent's disk_bytenr field if this is an
3599		 * inline extent, since at that offset, it's where the extent
3600		 * data starts.
3601		 */
3602		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3603		    (type == BTRFS_FILE_EXTENT_REG &&
3604		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3605			/*
3606			 * Explicit hole or prealloc extent, search for delalloc.
3607			 * A prealloc extent is treated like a hole.
3608			 */
3609			u64 search_start = key.offset;
3610			u64 found_start;
3611
3612			/*
3613			 * First iteration, @start matches @offset and it's
3614			 * within the hole.
3615			 */
3616			if (start == offset)
3617				search_start = offset;
3618
3619			found = find_desired_extent_in_hole(inode, whence,
3620							    delalloc_cached_state,
3621							    search_start,
3622							    extent_end - 1,
3623							    &found_start);
3624			if (found) {
3625				start = found_start;
3626				break;
3627			}
3628			/*
3629			 * Didn't find data or a hole (due to delalloc) in the
3630			 * implicit hole range, so need to analyze the next
3631			 * extent item.
3632			 */
3633		} else {
3634			/*
3635			 * Found a regular or inline extent.
3636			 * If we are seeking for data, adjust the start offset
3637			 * and stop, we're done.
3638			 */
3639			if (whence == SEEK_DATA) {
3640				start = max_t(u64, key.offset, offset);
3641				found = true;
3642				break;
3643			}
3644			/*
3645			 * Else, we are seeking for a hole, check the next file
3646			 * extent item.
3647			 */
3648		}
3649
3650		start = extent_end;
3651		last_extent_end = extent_end;
3652		path->slots[0]++;
3653		if (fatal_signal_pending(current)) {
3654			ret = -EINTR;
3655			goto out;
3656		}
3657		cond_resched();
3658	}
3659
3660	/* We have an implicit hole from the last extent found up to i_size. */
3661	if (!found && start < i_size) {
3662		found = find_desired_extent_in_hole(inode, whence,
3663						    delalloc_cached_state, start,
3664						    i_size - 1, &start);
3665		if (!found)
3666			start = i_size;
3667	}
3668
3669out:
3670	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3671	btrfs_free_path(path);
3672
3673	if (ret < 0)
3674		return ret;
3675
3676	if (whence == SEEK_DATA && start >= i_size)
3677		return -ENXIO;
3678
3679	return min_t(loff_t, start, i_size);
3680}
3681
3682static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3683{
3684	struct inode *inode = file->f_mapping->host;
3685
3686	switch (whence) {
3687	default:
3688		return generic_file_llseek(file, offset, whence);
3689	case SEEK_DATA:
3690	case SEEK_HOLE:
3691		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3692		offset = find_desired_extent(file, offset, whence);
3693		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3694		break;
3695	}
3696
3697	if (offset < 0)
3698		return offset;
3699
3700	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3701}
3702
3703static int btrfs_file_open(struct inode *inode, struct file *filp)
3704{
3705	int ret;
3706
3707	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
3708		        FMODE_CAN_ODIRECT;
3709
3710	ret = fsverity_file_open(inode, filp);
3711	if (ret)
3712		return ret;
3713	return generic_file_open(inode, filp);
3714}
3715
3716static int check_direct_read(struct btrfs_fs_info *fs_info,
3717			     const struct iov_iter *iter, loff_t offset)
3718{
3719	int ret;
3720	int i, seg;
3721
3722	ret = check_direct_IO(fs_info, iter, offset);
3723	if (ret < 0)
3724		return ret;
3725
3726	if (!iter_is_iovec(iter))
3727		return 0;
3728
3729	for (seg = 0; seg < iter->nr_segs; seg++) {
3730		for (i = seg + 1; i < iter->nr_segs; i++) {
3731			const struct iovec *iov1 = iter_iov(iter) + seg;
3732			const struct iovec *iov2 = iter_iov(iter) + i;
3733
3734			if (iov1->iov_base == iov2->iov_base)
3735				return -EINVAL;
3736		}
3737	}
3738	return 0;
3739}
3740
3741static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3742{
3743	struct inode *inode = file_inode(iocb->ki_filp);
3744	size_t prev_left = 0;
3745	ssize_t read = 0;
3746	ssize_t ret;
3747
3748	if (fsverity_active(inode))
3749		return 0;
3750
3751	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3752		return 0;
3753
3754	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3755again:
3756	/*
3757	 * This is similar to what we do for direct IO writes, see the comment
3758	 * at btrfs_direct_write(), but we also disable page faults in addition
3759	 * to disabling them only at the iov_iter level. This is because when
3760	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3761	 * which can still trigger page fault ins despite having set ->nofault
3762	 * to true of our 'to' iov_iter.
3763	 *
3764	 * The difference to direct IO writes is that we deadlock when trying
3765	 * to lock the extent range in the inode's tree during he page reads
3766	 * triggered by the fault in (while for writes it is due to waiting for
3767	 * our own ordered extent). This is because for direct IO reads,
3768	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
3769	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
3770	 */
3771	pagefault_disable();
3772	to->nofault = true;
3773	ret = btrfs_dio_read(iocb, to, read);
3774	to->nofault = false;
3775	pagefault_enable();
3776
3777	/* No increment (+=) because iomap returns a cumulative value. */
3778	if (ret > 0)
3779		read = ret;
3780
3781	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3782		const size_t left = iov_iter_count(to);
3783
3784		if (left == prev_left) {
3785			/*
3786			 * We didn't make any progress since the last attempt,
3787			 * fallback to a buffered read for the remainder of the
3788			 * range. This is just to avoid any possibility of looping
3789			 * for too long.
3790			 */
3791			ret = read;
3792		} else {
3793			/*
3794			 * We made some progress since the last retry or this is
3795			 * the first time we are retrying. Fault in as many pages
3796			 * as possible and retry.
3797			 */
3798			fault_in_iov_iter_writeable(to, left);
3799			prev_left = left;
3800			goto again;
3801		}
3802	}
3803	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3804	return ret < 0 ? ret : read;
3805}
3806
3807static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3808{
3809	ssize_t ret = 0;
3810
3811	if (iocb->ki_flags & IOCB_DIRECT) {
3812		ret = btrfs_direct_read(iocb, to);
3813		if (ret < 0 || !iov_iter_count(to) ||
3814		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3815			return ret;
3816	}
3817
3818	return filemap_read(iocb, to, ret);
3819}
3820
3821const struct file_operations btrfs_file_operations = {
3822	.llseek		= btrfs_file_llseek,
3823	.read_iter      = btrfs_file_read_iter,
3824	.splice_read	= filemap_splice_read,
3825	.write_iter	= btrfs_file_write_iter,
3826	.splice_write	= iter_file_splice_write,
3827	.mmap		= btrfs_file_mmap,
3828	.open		= btrfs_file_open,
3829	.release	= btrfs_release_file,
3830	.get_unmapped_area = thp_get_unmapped_area,
3831	.fsync		= btrfs_sync_file,
3832	.fallocate	= btrfs_fallocate,
3833	.unlocked_ioctl	= btrfs_ioctl,
3834#ifdef CONFIG_COMPAT
3835	.compat_ioctl	= btrfs_compat_ioctl,
3836#endif
3837	.remap_file_range = btrfs_remap_file_range,
3838};
3839
3840int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3841{
3842	int ret;
3843
3844	/*
3845	 * So with compression we will find and lock a dirty page and clear the
3846	 * first one as dirty, setup an async extent, and immediately return
3847	 * with the entire range locked but with nobody actually marked with
3848	 * writeback.  So we can't just filemap_write_and_wait_range() and
3849	 * expect it to work since it will just kick off a thread to do the
3850	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3851	 * since it will wait on the page lock, which won't be unlocked until
3852	 * after the pages have been marked as writeback and so we're good to go
3853	 * from there.  We have to do this otherwise we'll miss the ordered
3854	 * extents and that results in badness.  Please Josef, do not think you
3855	 * know better and pull this out at some point in the future, it is
3856	 * right and you are wrong.
3857	 */
3858	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3859	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3860			     &BTRFS_I(inode)->runtime_flags))
3861		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3862
3863	return ret;
3864}
3865