xref: /kernel/linux/linux-5.10/fs/btrfs/file.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22#include "btrfs_inode.h"
23#include "print-tree.h"
24#include "tree-log.h"
25#include "locking.h"
26#include "volumes.h"
27#include "qgroup.h"
28#include "compression.h"
29#include "delalloc-space.h"
30#include "reflink.h"
31
32static struct kmem_cache *btrfs_inode_defrag_cachep;
33/*
34 * when auto defrag is enabled we
35 * queue up these defrag structs to remember which
36 * inodes need defragging passes
37 */
38struct inode_defrag {
39	struct rb_node rb_node;
40	/* objectid */
41	u64 ino;
42	/*
43	 * transid where the defrag was added, we search for
44	 * extents newer than this
45	 */
46	u64 transid;
47
48	/* root objectid */
49	u64 root;
50
51	/* last offset we were able to defrag */
52	u64 last_offset;
53
54	/* if we've wrapped around back to zero once already */
55	int cycled;
56};
57
58static int __compare_inode_defrag(struct inode_defrag *defrag1,
59				  struct inode_defrag *defrag2)
60{
61	if (defrag1->root > defrag2->root)
62		return 1;
63	else if (defrag1->root < defrag2->root)
64		return -1;
65	else if (defrag1->ino > defrag2->ino)
66		return 1;
67	else if (defrag1->ino < defrag2->ino)
68		return -1;
69	else
70		return 0;
71}
72
73/* pop a record for an inode into the defrag tree.  The lock
74 * must be held already
75 *
76 * If you're inserting a record for an older transid than an
77 * existing record, the transid already in the tree is lowered
78 *
79 * If an existing record is found the defrag item you
80 * pass in is freed
81 */
82static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
83				    struct inode_defrag *defrag)
84{
85	struct btrfs_fs_info *fs_info = inode->root->fs_info;
86	struct inode_defrag *entry;
87	struct rb_node **p;
88	struct rb_node *parent = NULL;
89	int ret;
90
91	p = &fs_info->defrag_inodes.rb_node;
92	while (*p) {
93		parent = *p;
94		entry = rb_entry(parent, struct inode_defrag, rb_node);
95
96		ret = __compare_inode_defrag(defrag, entry);
97		if (ret < 0)
98			p = &parent->rb_left;
99		else if (ret > 0)
100			p = &parent->rb_right;
101		else {
102			/* if we're reinserting an entry for
103			 * an old defrag run, make sure to
104			 * lower the transid of our existing record
105			 */
106			if (defrag->transid < entry->transid)
107				entry->transid = defrag->transid;
108			if (defrag->last_offset > entry->last_offset)
109				entry->last_offset = defrag->last_offset;
110			return -EEXIST;
111		}
112	}
113	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
114	rb_link_node(&defrag->rb_node, parent, p);
115	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
116	return 0;
117}
118
119static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
120{
121	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
122		return 0;
123
124	if (btrfs_fs_closing(fs_info))
125		return 0;
126
127	return 1;
128}
129
130/*
131 * insert a defrag record for this inode if auto defrag is
132 * enabled
133 */
134int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
135			   struct btrfs_inode *inode)
136{
137	struct btrfs_root *root = inode->root;
138	struct btrfs_fs_info *fs_info = root->fs_info;
139	struct inode_defrag *defrag;
140	u64 transid;
141	int ret;
142
143	if (!__need_auto_defrag(fs_info))
144		return 0;
145
146	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
147		return 0;
148
149	if (trans)
150		transid = trans->transid;
151	else
152		transid = inode->root->last_trans;
153
154	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
155	if (!defrag)
156		return -ENOMEM;
157
158	defrag->ino = btrfs_ino(inode);
159	defrag->transid = transid;
160	defrag->root = root->root_key.objectid;
161
162	spin_lock(&fs_info->defrag_inodes_lock);
163	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
164		/*
165		 * If we set IN_DEFRAG flag and evict the inode from memory,
166		 * and then re-read this inode, this new inode doesn't have
167		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
168		 */
169		ret = __btrfs_add_inode_defrag(inode, defrag);
170		if (ret)
171			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
172	} else {
173		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
174	}
175	spin_unlock(&fs_info->defrag_inodes_lock);
176	return 0;
177}
178
179/*
180 * Requeue the defrag object. If there is a defrag object that points to
181 * the same inode in the tree, we will merge them together (by
182 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
183 */
184static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
185				       struct inode_defrag *defrag)
186{
187	struct btrfs_fs_info *fs_info = inode->root->fs_info;
188	int ret;
189
190	if (!__need_auto_defrag(fs_info))
191		goto out;
192
193	/*
194	 * Here we don't check the IN_DEFRAG flag, because we need merge
195	 * them together.
196	 */
197	spin_lock(&fs_info->defrag_inodes_lock);
198	ret = __btrfs_add_inode_defrag(inode, defrag);
199	spin_unlock(&fs_info->defrag_inodes_lock);
200	if (ret)
201		goto out;
202	return;
203out:
204	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
205}
206
207/*
208 * pick the defragable inode that we want, if it doesn't exist, we will get
209 * the next one.
210 */
211static struct inode_defrag *
212btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
213{
214	struct inode_defrag *entry = NULL;
215	struct inode_defrag tmp;
216	struct rb_node *p;
217	struct rb_node *parent = NULL;
218	int ret;
219
220	tmp.ino = ino;
221	tmp.root = root;
222
223	spin_lock(&fs_info->defrag_inodes_lock);
224	p = fs_info->defrag_inodes.rb_node;
225	while (p) {
226		parent = p;
227		entry = rb_entry(parent, struct inode_defrag, rb_node);
228
229		ret = __compare_inode_defrag(&tmp, entry);
230		if (ret < 0)
231			p = parent->rb_left;
232		else if (ret > 0)
233			p = parent->rb_right;
234		else
235			goto out;
236	}
237
238	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
239		parent = rb_next(parent);
240		if (parent)
241			entry = rb_entry(parent, struct inode_defrag, rb_node);
242		else
243			entry = NULL;
244	}
245out:
246	if (entry)
247		rb_erase(parent, &fs_info->defrag_inodes);
248	spin_unlock(&fs_info->defrag_inodes_lock);
249	return entry;
250}
251
252void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
253{
254	struct inode_defrag *defrag;
255	struct rb_node *node;
256
257	spin_lock(&fs_info->defrag_inodes_lock);
258	node = rb_first(&fs_info->defrag_inodes);
259	while (node) {
260		rb_erase(node, &fs_info->defrag_inodes);
261		defrag = rb_entry(node, struct inode_defrag, rb_node);
262		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
263
264		cond_resched_lock(&fs_info->defrag_inodes_lock);
265
266		node = rb_first(&fs_info->defrag_inodes);
267	}
268	spin_unlock(&fs_info->defrag_inodes_lock);
269}
270
271#define BTRFS_DEFRAG_BATCH	1024
272
273static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
274				    struct inode_defrag *defrag)
275{
276	struct btrfs_root *inode_root;
277	struct inode *inode;
278	struct btrfs_ioctl_defrag_range_args range;
279	int num_defrag;
280	int ret;
281
282	/* get the inode */
283	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
284	if (IS_ERR(inode_root)) {
285		ret = PTR_ERR(inode_root);
286		goto cleanup;
287	}
288
289	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
290	btrfs_put_root(inode_root);
291	if (IS_ERR(inode)) {
292		ret = PTR_ERR(inode);
293		goto cleanup;
294	}
295
296	/* do a chunk of defrag */
297	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
298	memset(&range, 0, sizeof(range));
299	range.len = (u64)-1;
300	range.start = defrag->last_offset;
301
302	sb_start_write(fs_info->sb);
303	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
304				       BTRFS_DEFRAG_BATCH);
305	sb_end_write(fs_info->sb);
306	/*
307	 * if we filled the whole defrag batch, there
308	 * must be more work to do.  Queue this defrag
309	 * again
310	 */
311	if (num_defrag == BTRFS_DEFRAG_BATCH) {
312		defrag->last_offset = range.start;
313		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
314	} else if (defrag->last_offset && !defrag->cycled) {
315		/*
316		 * we didn't fill our defrag batch, but
317		 * we didn't start at zero.  Make sure we loop
318		 * around to the start of the file.
319		 */
320		defrag->last_offset = 0;
321		defrag->cycled = 1;
322		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
323	} else {
324		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
325	}
326
327	iput(inode);
328	return 0;
329cleanup:
330	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
331	return ret;
332}
333
334/*
335 * run through the list of inodes in the FS that need
336 * defragging
337 */
338int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
339{
340	struct inode_defrag *defrag;
341	u64 first_ino = 0;
342	u64 root_objectid = 0;
343
344	atomic_inc(&fs_info->defrag_running);
345	while (1) {
346		/* Pause the auto defragger. */
347		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
348			     &fs_info->fs_state))
349			break;
350
351		if (!__need_auto_defrag(fs_info))
352			break;
353
354		/* find an inode to defrag */
355		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
356						 first_ino);
357		if (!defrag) {
358			if (root_objectid || first_ino) {
359				root_objectid = 0;
360				first_ino = 0;
361				continue;
362			} else {
363				break;
364			}
365		}
366
367		first_ino = defrag->ino + 1;
368		root_objectid = defrag->root;
369
370		__btrfs_run_defrag_inode(fs_info, defrag);
371	}
372	atomic_dec(&fs_info->defrag_running);
373
374	/*
375	 * during unmount, we use the transaction_wait queue to
376	 * wait for the defragger to stop
377	 */
378	wake_up(&fs_info->transaction_wait);
379	return 0;
380}
381
382/* simple helper to fault in pages and copy.  This should go away
383 * and be replaced with calls into generic code.
384 */
385static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
386					 struct page **prepared_pages,
387					 struct iov_iter *i)
388{
389	size_t copied = 0;
390	size_t total_copied = 0;
391	int pg = 0;
392	int offset = offset_in_page(pos);
393
394	while (write_bytes > 0) {
395		size_t count = min_t(size_t,
396				     PAGE_SIZE - offset, write_bytes);
397		struct page *page = prepared_pages[pg];
398		/*
399		 * Copy data from userspace to the current page
400		 */
401		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
402
403		/* Flush processor's dcache for this page */
404		flush_dcache_page(page);
405
406		/*
407		 * if we get a partial write, we can end up with
408		 * partially up to date pages.  These add
409		 * a lot of complexity, so make sure they don't
410		 * happen by forcing this copy to be retried.
411		 *
412		 * The rest of the btrfs_file_write code will fall
413		 * back to page at a time copies after we return 0.
414		 */
415		if (!PageUptodate(page) && copied < count)
416			copied = 0;
417
418		iov_iter_advance(i, copied);
419		write_bytes -= copied;
420		total_copied += copied;
421
422		/* Return to btrfs_file_write_iter to fault page */
423		if (unlikely(copied == 0))
424			break;
425
426		if (copied < PAGE_SIZE - offset) {
427			offset += copied;
428		} else {
429			pg++;
430			offset = 0;
431		}
432	}
433	return total_copied;
434}
435
436/*
437 * unlocks pages after btrfs_file_write is done with them
438 */
439static void btrfs_drop_pages(struct page **pages, size_t num_pages)
440{
441	size_t i;
442	for (i = 0; i < num_pages; i++) {
443		/* page checked is some magic around finding pages that
444		 * have been modified without going through btrfs_set_page_dirty
445		 * clear it here. There should be no need to mark the pages
446		 * accessed as prepare_pages should have marked them accessed
447		 * in prepare_pages via find_or_create_page()
448		 */
449		ClearPageChecked(pages[i]);
450		unlock_page(pages[i]);
451		put_page(pages[i]);
452	}
453}
454
455/*
456 * after copy_from_user, pages need to be dirtied and we need to make
457 * sure holes are created between the current EOF and the start of
458 * any next extents (if required).
459 *
460 * this also makes the decision about creating an inline extent vs
461 * doing real data extents, marking pages dirty and delalloc as required.
462 */
463int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
464		      size_t num_pages, loff_t pos, size_t write_bytes,
465		      struct extent_state **cached)
466{
467	struct btrfs_fs_info *fs_info = inode->root->fs_info;
468	int err = 0;
469	int i;
470	u64 num_bytes;
471	u64 start_pos;
472	u64 end_of_last_block;
473	u64 end_pos = pos + write_bytes;
474	loff_t isize = i_size_read(&inode->vfs_inode);
475	unsigned int extra_bits = 0;
476
477	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
478	num_bytes = round_up(write_bytes + pos - start_pos,
479			     fs_info->sectorsize);
480
481	end_of_last_block = start_pos + num_bytes - 1;
482
483	/*
484	 * The pages may have already been dirty, clear out old accounting so
485	 * we can set things up properly
486	 */
487	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
488			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
489			 0, 0, cached);
490
491	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
492					extra_bits, cached);
493	if (err)
494		return err;
495
496	for (i = 0; i < num_pages; i++) {
497		struct page *p = pages[i];
498		SetPageUptodate(p);
499		ClearPageChecked(p);
500		set_page_dirty(p);
501	}
502
503	/*
504	 * we've only changed i_size in ram, and we haven't updated
505	 * the disk i_size.  There is no need to log the inode
506	 * at this time.
507	 */
508	if (end_pos > isize)
509		i_size_write(&inode->vfs_inode, end_pos);
510	return 0;
511}
512
513/*
514 * this drops all the extents in the cache that intersect the range
515 * [start, end].  Existing extents are split as required.
516 */
517void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
518			     int skip_pinned)
519{
520	struct extent_map *em;
521	struct extent_map *split = NULL;
522	struct extent_map *split2 = NULL;
523	struct extent_map_tree *em_tree = &inode->extent_tree;
524	u64 len = end - start + 1;
525	u64 gen;
526	int ret;
527	int testend = 1;
528	unsigned long flags;
529	int compressed = 0;
530	bool modified;
531
532	WARN_ON(end < start);
533	if (end == (u64)-1) {
534		len = (u64)-1;
535		testend = 0;
536	}
537	while (1) {
538		int no_splits = 0;
539
540		modified = false;
541		if (!split)
542			split = alloc_extent_map();
543		if (!split2)
544			split2 = alloc_extent_map();
545		if (!split || !split2)
546			no_splits = 1;
547
548		write_lock(&em_tree->lock);
549		em = lookup_extent_mapping(em_tree, start, len);
550		if (!em) {
551			write_unlock(&em_tree->lock);
552			break;
553		}
554		flags = em->flags;
555		gen = em->generation;
556		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
557			if (testend && em->start + em->len >= start + len) {
558				free_extent_map(em);
559				write_unlock(&em_tree->lock);
560				break;
561			}
562			start = em->start + em->len;
563			if (testend)
564				len = start + len - (em->start + em->len);
565			free_extent_map(em);
566			write_unlock(&em_tree->lock);
567			continue;
568		}
569		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
570		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
571		clear_bit(EXTENT_FLAG_LOGGING, &flags);
572		modified = !list_empty(&em->list);
573		if (no_splits)
574			goto next;
575
576		if (em->start < start) {
577			split->start = em->start;
578			split->len = start - em->start;
579
580			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
581				split->orig_start = em->orig_start;
582				split->block_start = em->block_start;
583
584				if (compressed)
585					split->block_len = em->block_len;
586				else
587					split->block_len = split->len;
588				split->orig_block_len = max(split->block_len,
589						em->orig_block_len);
590				split->ram_bytes = em->ram_bytes;
591			} else {
592				split->orig_start = split->start;
593				split->block_len = 0;
594				split->block_start = em->block_start;
595				split->orig_block_len = 0;
596				split->ram_bytes = split->len;
597			}
598
599			split->generation = gen;
600			split->flags = flags;
601			split->compress_type = em->compress_type;
602			replace_extent_mapping(em_tree, em, split, modified);
603			free_extent_map(split);
604			split = split2;
605			split2 = NULL;
606		}
607		if (testend && em->start + em->len > start + len) {
608			u64 diff = start + len - em->start;
609
610			split->start = start + len;
611			split->len = em->start + em->len - (start + len);
612			split->flags = flags;
613			split->compress_type = em->compress_type;
614			split->generation = gen;
615
616			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
617				split->orig_block_len = max(em->block_len,
618						    em->orig_block_len);
619
620				split->ram_bytes = em->ram_bytes;
621				if (compressed) {
622					split->block_len = em->block_len;
623					split->block_start = em->block_start;
624					split->orig_start = em->orig_start;
625				} else {
626					split->block_len = split->len;
627					split->block_start = em->block_start
628						+ diff;
629					split->orig_start = em->orig_start;
630				}
631			} else {
632				split->ram_bytes = split->len;
633				split->orig_start = split->start;
634				split->block_len = 0;
635				split->block_start = em->block_start;
636				split->orig_block_len = 0;
637			}
638
639			if (extent_map_in_tree(em)) {
640				replace_extent_mapping(em_tree, em, split,
641						       modified);
642			} else {
643				ret = add_extent_mapping(em_tree, split,
644							 modified);
645				ASSERT(ret == 0); /* Logic error */
646			}
647			free_extent_map(split);
648			split = NULL;
649		}
650next:
651		if (extent_map_in_tree(em))
652			remove_extent_mapping(em_tree, em);
653		write_unlock(&em_tree->lock);
654
655		/* once for us */
656		free_extent_map(em);
657		/* once for the tree*/
658		free_extent_map(em);
659	}
660	if (split)
661		free_extent_map(split);
662	if (split2)
663		free_extent_map(split2);
664}
665
666/*
667 * this is very complex, but the basic idea is to drop all extents
668 * in the range start - end.  hint_block is filled in with a block number
669 * that would be a good hint to the block allocator for this file.
670 *
671 * If an extent intersects the range but is not entirely inside the range
672 * it is either truncated or split.  Anything entirely inside the range
673 * is deleted from the tree.
674 */
675int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
676			 struct btrfs_root *root, struct btrfs_inode *inode,
677			 struct btrfs_path *path, u64 start, u64 end,
678			 u64 *drop_end, int drop_cache,
679			 int replace_extent,
680			 u32 extent_item_size,
681			 int *key_inserted)
682{
683	struct btrfs_fs_info *fs_info = root->fs_info;
684	struct extent_buffer *leaf;
685	struct btrfs_file_extent_item *fi;
686	struct btrfs_ref ref = { 0 };
687	struct btrfs_key key;
688	struct btrfs_key new_key;
689	struct inode *vfs_inode = &inode->vfs_inode;
690	u64 ino = btrfs_ino(inode);
691	u64 search_start = start;
692	u64 disk_bytenr = 0;
693	u64 num_bytes = 0;
694	u64 extent_offset = 0;
695	u64 extent_end = 0;
696	u64 last_end = start;
697	int del_nr = 0;
698	int del_slot = 0;
699	int extent_type;
700	int recow;
701	int ret;
702	int modify_tree = -1;
703	int update_refs;
704	int found = 0;
705	int leafs_visited = 0;
706
707	if (drop_cache)
708		btrfs_drop_extent_cache(inode, start, end - 1, 0);
709
710	if (start >= inode->disk_i_size && !replace_extent)
711		modify_tree = 0;
712
713	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
714	while (1) {
715		recow = 0;
716		ret = btrfs_lookup_file_extent(trans, root, path, ino,
717					       search_start, modify_tree);
718		if (ret < 0)
719			break;
720		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
721			leaf = path->nodes[0];
722			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
723			if (key.objectid == ino &&
724			    key.type == BTRFS_EXTENT_DATA_KEY)
725				path->slots[0]--;
726		}
727		ret = 0;
728		leafs_visited++;
729next_slot:
730		leaf = path->nodes[0];
731		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
732			BUG_ON(del_nr > 0);
733			ret = btrfs_next_leaf(root, path);
734			if (ret < 0)
735				break;
736			if (ret > 0) {
737				ret = 0;
738				break;
739			}
740			leafs_visited++;
741			leaf = path->nodes[0];
742			recow = 1;
743		}
744
745		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
746
747		if (key.objectid > ino)
748			break;
749		if (WARN_ON_ONCE(key.objectid < ino) ||
750		    key.type < BTRFS_EXTENT_DATA_KEY) {
751			ASSERT(del_nr == 0);
752			path->slots[0]++;
753			goto next_slot;
754		}
755		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
756			break;
757
758		fi = btrfs_item_ptr(leaf, path->slots[0],
759				    struct btrfs_file_extent_item);
760		extent_type = btrfs_file_extent_type(leaf, fi);
761
762		if (extent_type == BTRFS_FILE_EXTENT_REG ||
763		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
764			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
765			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
766			extent_offset = btrfs_file_extent_offset(leaf, fi);
767			extent_end = key.offset +
768				btrfs_file_extent_num_bytes(leaf, fi);
769		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
770			extent_end = key.offset +
771				btrfs_file_extent_ram_bytes(leaf, fi);
772		} else {
773			/* can't happen */
774			BUG();
775		}
776
777		/*
778		 * Don't skip extent items representing 0 byte lengths. They
779		 * used to be created (bug) if while punching holes we hit
780		 * -ENOSPC condition. So if we find one here, just ensure we
781		 * delete it, otherwise we would insert a new file extent item
782		 * with the same key (offset) as that 0 bytes length file
783		 * extent item in the call to setup_items_for_insert() later
784		 * in this function.
785		 */
786		if (extent_end == key.offset && extent_end >= search_start) {
787			last_end = extent_end;
788			goto delete_extent_item;
789		}
790
791		if (extent_end <= search_start) {
792			path->slots[0]++;
793			goto next_slot;
794		}
795
796		found = 1;
797		search_start = max(key.offset, start);
798		if (recow || !modify_tree) {
799			modify_tree = -1;
800			btrfs_release_path(path);
801			continue;
802		}
803
804		/*
805		 *     | - range to drop - |
806		 *  | -------- extent -------- |
807		 */
808		if (start > key.offset && end < extent_end) {
809			BUG_ON(del_nr > 0);
810			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
811				ret = -EOPNOTSUPP;
812				break;
813			}
814
815			memcpy(&new_key, &key, sizeof(new_key));
816			new_key.offset = start;
817			ret = btrfs_duplicate_item(trans, root, path,
818						   &new_key);
819			if (ret == -EAGAIN) {
820				btrfs_release_path(path);
821				continue;
822			}
823			if (ret < 0)
824				break;
825
826			leaf = path->nodes[0];
827			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
828					    struct btrfs_file_extent_item);
829			btrfs_set_file_extent_num_bytes(leaf, fi,
830							start - key.offset);
831
832			fi = btrfs_item_ptr(leaf, path->slots[0],
833					    struct btrfs_file_extent_item);
834
835			extent_offset += start - key.offset;
836			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
837			btrfs_set_file_extent_num_bytes(leaf, fi,
838							extent_end - start);
839			btrfs_mark_buffer_dirty(leaf);
840
841			if (update_refs && disk_bytenr > 0) {
842				btrfs_init_generic_ref(&ref,
843						BTRFS_ADD_DELAYED_REF,
844						disk_bytenr, num_bytes, 0);
845				btrfs_init_data_ref(&ref,
846						root->root_key.objectid,
847						new_key.objectid,
848						start - extent_offset);
849				ret = btrfs_inc_extent_ref(trans, &ref);
850				BUG_ON(ret); /* -ENOMEM */
851			}
852			key.offset = start;
853		}
854		/*
855		 * From here on out we will have actually dropped something, so
856		 * last_end can be updated.
857		 */
858		last_end = extent_end;
859
860		/*
861		 *  | ---- range to drop ----- |
862		 *      | -------- extent -------- |
863		 */
864		if (start <= key.offset && end < extent_end) {
865			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
866				ret = -EOPNOTSUPP;
867				break;
868			}
869
870			memcpy(&new_key, &key, sizeof(new_key));
871			new_key.offset = end;
872			btrfs_set_item_key_safe(fs_info, path, &new_key);
873
874			extent_offset += end - key.offset;
875			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
876			btrfs_set_file_extent_num_bytes(leaf, fi,
877							extent_end - end);
878			btrfs_mark_buffer_dirty(leaf);
879			if (update_refs && disk_bytenr > 0)
880				inode_sub_bytes(vfs_inode, end - key.offset);
881			break;
882		}
883
884		search_start = extent_end;
885		/*
886		 *       | ---- range to drop ----- |
887		 *  | -------- extent -------- |
888		 */
889		if (start > key.offset && end >= extent_end) {
890			BUG_ON(del_nr > 0);
891			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
892				ret = -EOPNOTSUPP;
893				break;
894			}
895
896			btrfs_set_file_extent_num_bytes(leaf, fi,
897							start - key.offset);
898			btrfs_mark_buffer_dirty(leaf);
899			if (update_refs && disk_bytenr > 0)
900				inode_sub_bytes(vfs_inode, extent_end - start);
901			if (end == extent_end)
902				break;
903
904			path->slots[0]++;
905			goto next_slot;
906		}
907
908		/*
909		 *  | ---- range to drop ----- |
910		 *    | ------ extent ------ |
911		 */
912		if (start <= key.offset && end >= extent_end) {
913delete_extent_item:
914			if (del_nr == 0) {
915				del_slot = path->slots[0];
916				del_nr = 1;
917			} else {
918				BUG_ON(del_slot + del_nr != path->slots[0]);
919				del_nr++;
920			}
921
922			if (update_refs &&
923			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
924				inode_sub_bytes(vfs_inode,
925						extent_end - key.offset);
926				extent_end = ALIGN(extent_end,
927						   fs_info->sectorsize);
928			} else if (update_refs && disk_bytenr > 0) {
929				btrfs_init_generic_ref(&ref,
930						BTRFS_DROP_DELAYED_REF,
931						disk_bytenr, num_bytes, 0);
932				btrfs_init_data_ref(&ref,
933						root->root_key.objectid,
934						key.objectid,
935						key.offset - extent_offset);
936				ret = btrfs_free_extent(trans, &ref);
937				BUG_ON(ret); /* -ENOMEM */
938				inode_sub_bytes(vfs_inode,
939						extent_end - key.offset);
940			}
941
942			if (end == extent_end)
943				break;
944
945			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
946				path->slots[0]++;
947				goto next_slot;
948			}
949
950			ret = btrfs_del_items(trans, root, path, del_slot,
951					      del_nr);
952			if (ret) {
953				btrfs_abort_transaction(trans, ret);
954				break;
955			}
956
957			del_nr = 0;
958			del_slot = 0;
959
960			btrfs_release_path(path);
961			continue;
962		}
963
964		BUG();
965	}
966
967	if (!ret && del_nr > 0) {
968		/*
969		 * Set path->slots[0] to first slot, so that after the delete
970		 * if items are move off from our leaf to its immediate left or
971		 * right neighbor leafs, we end up with a correct and adjusted
972		 * path->slots[0] for our insertion (if replace_extent != 0).
973		 */
974		path->slots[0] = del_slot;
975		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
976		if (ret)
977			btrfs_abort_transaction(trans, ret);
978	}
979
980	leaf = path->nodes[0];
981	/*
982	 * If btrfs_del_items() was called, it might have deleted a leaf, in
983	 * which case it unlocked our path, so check path->locks[0] matches a
984	 * write lock.
985	 */
986	if (!ret && replace_extent && leafs_visited == 1 &&
987	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
988	     path->locks[0] == BTRFS_WRITE_LOCK) &&
989	    btrfs_leaf_free_space(leaf) >=
990	    sizeof(struct btrfs_item) + extent_item_size) {
991
992		key.objectid = ino;
993		key.type = BTRFS_EXTENT_DATA_KEY;
994		key.offset = start;
995		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
996			struct btrfs_key slot_key;
997
998			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
999			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1000				path->slots[0]++;
1001		}
1002		setup_items_for_insert(root, path, &key, &extent_item_size, 1);
1003		*key_inserted = 1;
1004	}
1005
1006	if (!replace_extent || !(*key_inserted))
1007		btrfs_release_path(path);
1008	if (drop_end)
1009		*drop_end = found ? min(end, last_end) : end;
1010	return ret;
1011}
1012
1013int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1014		       struct btrfs_root *root, struct inode *inode, u64 start,
1015		       u64 end, int drop_cache)
1016{
1017	struct btrfs_path *path;
1018	int ret;
1019
1020	path = btrfs_alloc_path();
1021	if (!path)
1022		return -ENOMEM;
1023	ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
1024				   end, NULL, drop_cache, 0, 0, NULL);
1025	btrfs_free_path(path);
1026	return ret;
1027}
1028
1029static int extent_mergeable(struct extent_buffer *leaf, int slot,
1030			    u64 objectid, u64 bytenr, u64 orig_offset,
1031			    u64 *start, u64 *end)
1032{
1033	struct btrfs_file_extent_item *fi;
1034	struct btrfs_key key;
1035	u64 extent_end;
1036
1037	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1038		return 0;
1039
1040	btrfs_item_key_to_cpu(leaf, &key, slot);
1041	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1042		return 0;
1043
1044	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1045	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1046	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1047	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1048	    btrfs_file_extent_compression(leaf, fi) ||
1049	    btrfs_file_extent_encryption(leaf, fi) ||
1050	    btrfs_file_extent_other_encoding(leaf, fi))
1051		return 0;
1052
1053	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1054	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1055		return 0;
1056
1057	*start = key.offset;
1058	*end = extent_end;
1059	return 1;
1060}
1061
1062/*
1063 * Mark extent in the range start - end as written.
1064 *
1065 * This changes extent type from 'pre-allocated' to 'regular'. If only
1066 * part of extent is marked as written, the extent will be split into
1067 * two or three.
1068 */
1069int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1070			      struct btrfs_inode *inode, u64 start, u64 end)
1071{
1072	struct btrfs_fs_info *fs_info = trans->fs_info;
1073	struct btrfs_root *root = inode->root;
1074	struct extent_buffer *leaf;
1075	struct btrfs_path *path;
1076	struct btrfs_file_extent_item *fi;
1077	struct btrfs_ref ref = { 0 };
1078	struct btrfs_key key;
1079	struct btrfs_key new_key;
1080	u64 bytenr;
1081	u64 num_bytes;
1082	u64 extent_end;
1083	u64 orig_offset;
1084	u64 other_start;
1085	u64 other_end;
1086	u64 split;
1087	int del_nr = 0;
1088	int del_slot = 0;
1089	int recow;
1090	int ret = 0;
1091	u64 ino = btrfs_ino(inode);
1092
1093	path = btrfs_alloc_path();
1094	if (!path)
1095		return -ENOMEM;
1096again:
1097	recow = 0;
1098	split = start;
1099	key.objectid = ino;
1100	key.type = BTRFS_EXTENT_DATA_KEY;
1101	key.offset = split;
1102
1103	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1104	if (ret < 0)
1105		goto out;
1106	if (ret > 0 && path->slots[0] > 0)
1107		path->slots[0]--;
1108
1109	leaf = path->nodes[0];
1110	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1111	if (key.objectid != ino ||
1112	    key.type != BTRFS_EXTENT_DATA_KEY) {
1113		ret = -EINVAL;
1114		btrfs_abort_transaction(trans, ret);
1115		goto out;
1116	}
1117	fi = btrfs_item_ptr(leaf, path->slots[0],
1118			    struct btrfs_file_extent_item);
1119	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1120		ret = -EINVAL;
1121		btrfs_abort_transaction(trans, ret);
1122		goto out;
1123	}
1124	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1125	if (key.offset > start || extent_end < end) {
1126		ret = -EINVAL;
1127		btrfs_abort_transaction(trans, ret);
1128		goto out;
1129	}
1130
1131	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1132	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1133	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1134	memcpy(&new_key, &key, sizeof(new_key));
1135
1136	if (start == key.offset && end < extent_end) {
1137		other_start = 0;
1138		other_end = start;
1139		if (extent_mergeable(leaf, path->slots[0] - 1,
1140				     ino, bytenr, orig_offset,
1141				     &other_start, &other_end)) {
1142			new_key.offset = end;
1143			btrfs_set_item_key_safe(fs_info, path, &new_key);
1144			fi = btrfs_item_ptr(leaf, path->slots[0],
1145					    struct btrfs_file_extent_item);
1146			btrfs_set_file_extent_generation(leaf, fi,
1147							 trans->transid);
1148			btrfs_set_file_extent_num_bytes(leaf, fi,
1149							extent_end - end);
1150			btrfs_set_file_extent_offset(leaf, fi,
1151						     end - orig_offset);
1152			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1153					    struct btrfs_file_extent_item);
1154			btrfs_set_file_extent_generation(leaf, fi,
1155							 trans->transid);
1156			btrfs_set_file_extent_num_bytes(leaf, fi,
1157							end - other_start);
1158			btrfs_mark_buffer_dirty(leaf);
1159			goto out;
1160		}
1161	}
1162
1163	if (start > key.offset && end == extent_end) {
1164		other_start = end;
1165		other_end = 0;
1166		if (extent_mergeable(leaf, path->slots[0] + 1,
1167				     ino, bytenr, orig_offset,
1168				     &other_start, &other_end)) {
1169			fi = btrfs_item_ptr(leaf, path->slots[0],
1170					    struct btrfs_file_extent_item);
1171			btrfs_set_file_extent_num_bytes(leaf, fi,
1172							start - key.offset);
1173			btrfs_set_file_extent_generation(leaf, fi,
1174							 trans->transid);
1175			path->slots[0]++;
1176			new_key.offset = start;
1177			btrfs_set_item_key_safe(fs_info, path, &new_key);
1178
1179			fi = btrfs_item_ptr(leaf, path->slots[0],
1180					    struct btrfs_file_extent_item);
1181			btrfs_set_file_extent_generation(leaf, fi,
1182							 trans->transid);
1183			btrfs_set_file_extent_num_bytes(leaf, fi,
1184							other_end - start);
1185			btrfs_set_file_extent_offset(leaf, fi,
1186						     start - orig_offset);
1187			btrfs_mark_buffer_dirty(leaf);
1188			goto out;
1189		}
1190	}
1191
1192	while (start > key.offset || end < extent_end) {
1193		if (key.offset == start)
1194			split = end;
1195
1196		new_key.offset = split;
1197		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1198		if (ret == -EAGAIN) {
1199			btrfs_release_path(path);
1200			goto again;
1201		}
1202		if (ret < 0) {
1203			btrfs_abort_transaction(trans, ret);
1204			goto out;
1205		}
1206
1207		leaf = path->nodes[0];
1208		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1209				    struct btrfs_file_extent_item);
1210		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1211		btrfs_set_file_extent_num_bytes(leaf, fi,
1212						split - key.offset);
1213
1214		fi = btrfs_item_ptr(leaf, path->slots[0],
1215				    struct btrfs_file_extent_item);
1216
1217		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1218		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1219		btrfs_set_file_extent_num_bytes(leaf, fi,
1220						extent_end - split);
1221		btrfs_mark_buffer_dirty(leaf);
1222
1223		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1224				       num_bytes, 0);
1225		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1226				    orig_offset);
1227		ret = btrfs_inc_extent_ref(trans, &ref);
1228		if (ret) {
1229			btrfs_abort_transaction(trans, ret);
1230			goto out;
1231		}
1232
1233		if (split == start) {
1234			key.offset = start;
1235		} else {
1236			if (start != key.offset) {
1237				ret = -EINVAL;
1238				btrfs_abort_transaction(trans, ret);
1239				goto out;
1240			}
1241			path->slots[0]--;
1242			extent_end = end;
1243		}
1244		recow = 1;
1245	}
1246
1247	other_start = end;
1248	other_end = 0;
1249	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1250			       num_bytes, 0);
1251	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1252	if (extent_mergeable(leaf, path->slots[0] + 1,
1253			     ino, bytenr, orig_offset,
1254			     &other_start, &other_end)) {
1255		if (recow) {
1256			btrfs_release_path(path);
1257			goto again;
1258		}
1259		extent_end = other_end;
1260		del_slot = path->slots[0] + 1;
1261		del_nr++;
1262		ret = btrfs_free_extent(trans, &ref);
1263		if (ret) {
1264			btrfs_abort_transaction(trans, ret);
1265			goto out;
1266		}
1267	}
1268	other_start = 0;
1269	other_end = start;
1270	if (extent_mergeable(leaf, path->slots[0] - 1,
1271			     ino, bytenr, orig_offset,
1272			     &other_start, &other_end)) {
1273		if (recow) {
1274			btrfs_release_path(path);
1275			goto again;
1276		}
1277		key.offset = other_start;
1278		del_slot = path->slots[0];
1279		del_nr++;
1280		ret = btrfs_free_extent(trans, &ref);
1281		if (ret) {
1282			btrfs_abort_transaction(trans, ret);
1283			goto out;
1284		}
1285	}
1286	if (del_nr == 0) {
1287		fi = btrfs_item_ptr(leaf, path->slots[0],
1288			   struct btrfs_file_extent_item);
1289		btrfs_set_file_extent_type(leaf, fi,
1290					   BTRFS_FILE_EXTENT_REG);
1291		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1292		btrfs_mark_buffer_dirty(leaf);
1293	} else {
1294		fi = btrfs_item_ptr(leaf, del_slot - 1,
1295			   struct btrfs_file_extent_item);
1296		btrfs_set_file_extent_type(leaf, fi,
1297					   BTRFS_FILE_EXTENT_REG);
1298		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1299		btrfs_set_file_extent_num_bytes(leaf, fi,
1300						extent_end - key.offset);
1301		btrfs_mark_buffer_dirty(leaf);
1302
1303		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1304		if (ret < 0) {
1305			btrfs_abort_transaction(trans, ret);
1306			goto out;
1307		}
1308	}
1309out:
1310	btrfs_free_path(path);
1311	return ret;
1312}
1313
1314/*
1315 * on error we return an unlocked page and the error value
1316 * on success we return a locked page and 0
1317 */
1318static int prepare_uptodate_page(struct inode *inode,
1319				 struct page *page, u64 pos,
1320				 bool force_uptodate)
1321{
1322	int ret = 0;
1323
1324	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1325	    !PageUptodate(page)) {
1326		ret = btrfs_readpage(NULL, page);
1327		if (ret)
1328			return ret;
1329		lock_page(page);
1330		if (!PageUptodate(page)) {
1331			unlock_page(page);
1332			return -EIO;
1333		}
1334		if (page->mapping != inode->i_mapping) {
1335			unlock_page(page);
1336			return -EAGAIN;
1337		}
1338	}
1339	return 0;
1340}
1341
1342/*
1343 * this just gets pages into the page cache and locks them down.
1344 */
1345static noinline int prepare_pages(struct inode *inode, struct page **pages,
1346				  size_t num_pages, loff_t pos,
1347				  size_t write_bytes, bool force_uptodate)
1348{
1349	int i;
1350	unsigned long index = pos >> PAGE_SHIFT;
1351	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1352	int err = 0;
1353	int faili;
1354
1355	for (i = 0; i < num_pages; i++) {
1356again:
1357		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1358					       mask | __GFP_WRITE);
1359		if (!pages[i]) {
1360			faili = i - 1;
1361			err = -ENOMEM;
1362			goto fail;
1363		}
1364
1365		if (i == 0)
1366			err = prepare_uptodate_page(inode, pages[i], pos,
1367						    force_uptodate);
1368		if (!err && i == num_pages - 1)
1369			err = prepare_uptodate_page(inode, pages[i],
1370						    pos + write_bytes, false);
1371		if (err) {
1372			put_page(pages[i]);
1373			if (err == -EAGAIN) {
1374				err = 0;
1375				goto again;
1376			}
1377			faili = i - 1;
1378			goto fail;
1379		}
1380		wait_on_page_writeback(pages[i]);
1381	}
1382
1383	return 0;
1384fail:
1385	while (faili >= 0) {
1386		unlock_page(pages[faili]);
1387		put_page(pages[faili]);
1388		faili--;
1389	}
1390	return err;
1391
1392}
1393
1394/*
1395 * This function locks the extent and properly waits for data=ordered extents
1396 * to finish before allowing the pages to be modified if need.
1397 *
1398 * The return value:
1399 * 1 - the extent is locked
1400 * 0 - the extent is not locked, and everything is OK
1401 * -EAGAIN - need re-prepare the pages
1402 * the other < 0 number - Something wrong happens
1403 */
1404static noinline int
1405lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1406				size_t num_pages, loff_t pos,
1407				size_t write_bytes,
1408				u64 *lockstart, u64 *lockend,
1409				struct extent_state **cached_state)
1410{
1411	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1412	u64 start_pos;
1413	u64 last_pos;
1414	int i;
1415	int ret = 0;
1416
1417	start_pos = round_down(pos, fs_info->sectorsize);
1418	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1419
1420	if (start_pos < inode->vfs_inode.i_size) {
1421		struct btrfs_ordered_extent *ordered;
1422
1423		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1424				cached_state);
1425		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1426						     last_pos - start_pos + 1);
1427		if (ordered &&
1428		    ordered->file_offset + ordered->num_bytes > start_pos &&
1429		    ordered->file_offset <= last_pos) {
1430			unlock_extent_cached(&inode->io_tree, start_pos,
1431					last_pos, cached_state);
1432			for (i = 0; i < num_pages; i++) {
1433				unlock_page(pages[i]);
1434				put_page(pages[i]);
1435			}
1436			btrfs_start_ordered_extent(ordered, 1);
1437			btrfs_put_ordered_extent(ordered);
1438			return -EAGAIN;
1439		}
1440		if (ordered)
1441			btrfs_put_ordered_extent(ordered);
1442
1443		*lockstart = start_pos;
1444		*lockend = last_pos;
1445		ret = 1;
1446	}
1447
1448	/*
1449	 * It's possible the pages are dirty right now, but we don't want
1450	 * to clean them yet because copy_from_user may catch a page fault
1451	 * and we might have to fall back to one page at a time.  If that
1452	 * happens, we'll unlock these pages and we'd have a window where
1453	 * reclaim could sneak in and drop the once-dirty page on the floor
1454	 * without writing it.
1455	 *
1456	 * We have the pages locked and the extent range locked, so there's
1457	 * no way someone can start IO on any dirty pages in this range.
1458	 *
1459	 * We'll call btrfs_dirty_pages() later on, and that will flip around
1460	 * delalloc bits and dirty the pages as required.
1461	 */
1462	for (i = 0; i < num_pages; i++) {
1463		set_page_extent_mapped(pages[i]);
1464		WARN_ON(!PageLocked(pages[i]));
1465	}
1466
1467	return ret;
1468}
1469
1470static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1471			   size_t *write_bytes, bool nowait)
1472{
1473	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1474	struct btrfs_root *root = inode->root;
1475	u64 lockstart, lockend;
1476	u64 num_bytes;
1477	int ret;
1478
1479	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1480		return 0;
1481
1482	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1483		return -EAGAIN;
1484
1485	lockstart = round_down(pos, fs_info->sectorsize);
1486	lockend = round_up(pos + *write_bytes,
1487			   fs_info->sectorsize) - 1;
1488	num_bytes = lockend - lockstart + 1;
1489
1490	if (nowait) {
1491		struct btrfs_ordered_extent *ordered;
1492
1493		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1494			return -EAGAIN;
1495
1496		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1497						     num_bytes);
1498		if (ordered) {
1499			btrfs_put_ordered_extent(ordered);
1500			ret = -EAGAIN;
1501			goto out_unlock;
1502		}
1503	} else {
1504		btrfs_lock_and_flush_ordered_range(inode, lockstart,
1505						   lockend, NULL);
1506	}
1507
1508	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1509			NULL, NULL, NULL, false);
1510	if (ret <= 0) {
1511		ret = 0;
1512		if (!nowait)
1513			btrfs_drew_write_unlock(&root->snapshot_lock);
1514	} else {
1515		*write_bytes = min_t(size_t, *write_bytes ,
1516				     num_bytes - pos + lockstart);
1517	}
1518out_unlock:
1519	unlock_extent(&inode->io_tree, lockstart, lockend);
1520
1521	return ret;
1522}
1523
1524static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1525			      size_t *write_bytes)
1526{
1527	return check_can_nocow(inode, pos, write_bytes, true);
1528}
1529
1530/*
1531 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1532 *
1533 * @pos:	 File offset
1534 * @write_bytes: The length to write, will be updated to the nocow writeable
1535 *		 range
1536 *
1537 * This function will flush ordered extents in the range to ensure proper
1538 * nocow checks.
1539 *
1540 * Return:
1541 * >0		and update @write_bytes if we can do nocow write
1542 *  0		if we can't do nocow write
1543 * -EAGAIN	if we can't get the needed lock or there are ordered extents
1544 * 		for * (nowait == true) case
1545 * <0		if other error happened
1546 *
1547 * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1548 */
1549int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1550			   size_t *write_bytes)
1551{
1552	return check_can_nocow(inode, pos, write_bytes, false);
1553}
1554
1555void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1556{
1557	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1558}
1559
1560static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1561					       struct iov_iter *i)
1562{
1563	struct file *file = iocb->ki_filp;
1564	loff_t pos = iocb->ki_pos;
1565	struct inode *inode = file_inode(file);
1566	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1567	struct page **pages = NULL;
1568	struct extent_changeset *data_reserved = NULL;
1569	u64 release_bytes = 0;
1570	u64 lockstart;
1571	u64 lockend;
1572	size_t num_written = 0;
1573	int nrptrs;
1574	int ret = 0;
1575	bool only_release_metadata = false;
1576	bool force_page_uptodate = false;
1577
1578	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1579			PAGE_SIZE / (sizeof(struct page *)));
1580	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1581	nrptrs = max(nrptrs, 8);
1582	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1583	if (!pages)
1584		return -ENOMEM;
1585
1586	while (iov_iter_count(i) > 0) {
1587		struct extent_state *cached_state = NULL;
1588		size_t offset = offset_in_page(pos);
1589		size_t sector_offset;
1590		size_t write_bytes = min(iov_iter_count(i),
1591					 nrptrs * (size_t)PAGE_SIZE -
1592					 offset);
1593		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1594						PAGE_SIZE);
1595		size_t reserve_bytes;
1596		size_t dirty_pages;
1597		size_t copied;
1598		size_t dirty_sectors;
1599		size_t num_sectors;
1600		int extents_locked;
1601
1602		WARN_ON(num_pages > nrptrs);
1603
1604		/*
1605		 * Fault pages before locking them in prepare_pages
1606		 * to avoid recursive lock
1607		 */
1608		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1609			ret = -EFAULT;
1610			break;
1611		}
1612
1613		only_release_metadata = false;
1614		sector_offset = pos & (fs_info->sectorsize - 1);
1615		reserve_bytes = round_up(write_bytes + sector_offset,
1616				fs_info->sectorsize);
1617
1618		extent_changeset_release(data_reserved);
1619		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1620						  &data_reserved, pos,
1621						  write_bytes);
1622		if (ret < 0) {
1623			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1624						   &write_bytes) > 0) {
1625				/*
1626				 * For nodata cow case, no need to reserve
1627				 * data space.
1628				 */
1629				only_release_metadata = true;
1630				/*
1631				 * our prealloc extent may be smaller than
1632				 * write_bytes, so scale down.
1633				 */
1634				num_pages = DIV_ROUND_UP(write_bytes + offset,
1635							 PAGE_SIZE);
1636				reserve_bytes = round_up(write_bytes +
1637							 sector_offset,
1638							 fs_info->sectorsize);
1639			} else {
1640				break;
1641			}
1642		}
1643
1644		WARN_ON(reserve_bytes == 0);
1645		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1646				reserve_bytes);
1647		if (ret) {
1648			if (!only_release_metadata)
1649				btrfs_free_reserved_data_space(BTRFS_I(inode),
1650						data_reserved, pos,
1651						write_bytes);
1652			else
1653				btrfs_check_nocow_unlock(BTRFS_I(inode));
1654			break;
1655		}
1656
1657		release_bytes = reserve_bytes;
1658again:
1659		/*
1660		 * This is going to setup the pages array with the number of
1661		 * pages we want, so we don't really need to worry about the
1662		 * contents of pages from loop to loop
1663		 */
1664		ret = prepare_pages(inode, pages, num_pages,
1665				    pos, write_bytes,
1666				    force_page_uptodate);
1667		if (ret) {
1668			btrfs_delalloc_release_extents(BTRFS_I(inode),
1669						       reserve_bytes);
1670			break;
1671		}
1672
1673		extents_locked = lock_and_cleanup_extent_if_need(
1674				BTRFS_I(inode), pages,
1675				num_pages, pos, write_bytes, &lockstart,
1676				&lockend, &cached_state);
1677		if (extents_locked < 0) {
1678			if (extents_locked == -EAGAIN)
1679				goto again;
1680			btrfs_delalloc_release_extents(BTRFS_I(inode),
1681						       reserve_bytes);
1682			ret = extents_locked;
1683			break;
1684		}
1685
1686		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1687
1688		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1689		dirty_sectors = round_up(copied + sector_offset,
1690					fs_info->sectorsize);
1691		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1692
1693		/*
1694		 * if we have trouble faulting in the pages, fall
1695		 * back to one page at a time
1696		 */
1697		if (copied < write_bytes)
1698			nrptrs = 1;
1699
1700		if (copied == 0) {
1701			force_page_uptodate = true;
1702			dirty_sectors = 0;
1703			dirty_pages = 0;
1704		} else {
1705			force_page_uptodate = false;
1706			dirty_pages = DIV_ROUND_UP(copied + offset,
1707						   PAGE_SIZE);
1708		}
1709
1710		if (num_sectors > dirty_sectors) {
1711			/* release everything except the sectors we dirtied */
1712			release_bytes -= dirty_sectors <<
1713						fs_info->sb->s_blocksize_bits;
1714			if (only_release_metadata) {
1715				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1716							release_bytes, true);
1717			} else {
1718				u64 __pos;
1719
1720				__pos = round_down(pos,
1721						   fs_info->sectorsize) +
1722					(dirty_pages << PAGE_SHIFT);
1723				btrfs_delalloc_release_space(BTRFS_I(inode),
1724						data_reserved, __pos,
1725						release_bytes, true);
1726			}
1727		}
1728
1729		release_bytes = round_up(copied + sector_offset,
1730					fs_info->sectorsize);
1731
1732		if (copied > 0)
1733			ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1734						dirty_pages, pos, copied,
1735						&cached_state);
1736
1737		/*
1738		 * If we have not locked the extent range, because the range's
1739		 * start offset is >= i_size, we might still have a non-NULL
1740		 * cached extent state, acquired while marking the extent range
1741		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1742		 * possible cached extent state to avoid a memory leak.
1743		 */
1744		if (extents_locked)
1745			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1746					     lockstart, lockend, &cached_state);
1747		else
1748			free_extent_state(cached_state);
1749
1750		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1751		if (ret) {
1752			btrfs_drop_pages(pages, num_pages);
1753			break;
1754		}
1755
1756		release_bytes = 0;
1757		if (only_release_metadata)
1758			btrfs_check_nocow_unlock(BTRFS_I(inode));
1759
1760		if (only_release_metadata && copied > 0) {
1761			lockstart = round_down(pos,
1762					       fs_info->sectorsize);
1763			lockend = round_up(pos + copied,
1764					   fs_info->sectorsize) - 1;
1765
1766			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1767				       lockend, EXTENT_NORESERVE, NULL,
1768				       NULL, GFP_NOFS);
1769		}
1770
1771		btrfs_drop_pages(pages, num_pages);
1772
1773		cond_resched();
1774
1775		balance_dirty_pages_ratelimited(inode->i_mapping);
1776
1777		pos += copied;
1778		num_written += copied;
1779	}
1780
1781	kfree(pages);
1782
1783	if (release_bytes) {
1784		if (only_release_metadata) {
1785			btrfs_check_nocow_unlock(BTRFS_I(inode));
1786			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1787					release_bytes, true);
1788		} else {
1789			btrfs_delalloc_release_space(BTRFS_I(inode),
1790					data_reserved,
1791					round_down(pos, fs_info->sectorsize),
1792					release_bytes, true);
1793		}
1794	}
1795
1796	extent_changeset_free(data_reserved);
1797	return num_written ? num_written : ret;
1798}
1799
1800static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1801{
1802	struct file *file = iocb->ki_filp;
1803	struct inode *inode = file_inode(file);
1804	loff_t pos;
1805	ssize_t written;
1806	ssize_t written_buffered;
1807	loff_t endbyte;
1808	int err;
1809
1810	written = btrfs_direct_IO(iocb, from);
1811
1812	if (written < 0 || !iov_iter_count(from))
1813		return written;
1814
1815	pos = iocb->ki_pos;
1816	written_buffered = btrfs_buffered_write(iocb, from);
1817	if (written_buffered < 0) {
1818		err = written_buffered;
1819		goto out;
1820	}
1821	/*
1822	 * Ensure all data is persisted. We want the next direct IO read to be
1823	 * able to read what was just written.
1824	 */
1825	endbyte = pos + written_buffered - 1;
1826	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1827	if (err)
1828		goto out;
1829	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1830	if (err)
1831		goto out;
1832	written += written_buffered;
1833	iocb->ki_pos = pos + written_buffered;
1834	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1835				 endbyte >> PAGE_SHIFT);
1836out:
1837	return written ? written : err;
1838}
1839
1840static void update_time_for_write(struct inode *inode)
1841{
1842	struct timespec64 now;
1843
1844	if (IS_NOCMTIME(inode))
1845		return;
1846
1847	now = current_time(inode);
1848	if (!timespec64_equal(&inode->i_mtime, &now))
1849		inode->i_mtime = now;
1850
1851	if (!timespec64_equal(&inode->i_ctime, &now))
1852		inode->i_ctime = now;
1853
1854	if (IS_I_VERSION(inode))
1855		inode_inc_iversion(inode);
1856}
1857
1858static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1859				    struct iov_iter *from)
1860{
1861	struct file *file = iocb->ki_filp;
1862	struct inode *inode = file_inode(file);
1863	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1864	u64 start_pos;
1865	u64 end_pos;
1866	ssize_t num_written = 0;
1867	const bool sync = iocb->ki_flags & IOCB_DSYNC;
1868	ssize_t err;
1869	loff_t pos;
1870	size_t count;
1871	loff_t oldsize;
1872	int clean_page = 0;
1873
1874	if (!(iocb->ki_flags & IOCB_DIRECT) &&
1875	    (iocb->ki_flags & IOCB_NOWAIT))
1876		return -EOPNOTSUPP;
1877
1878	if (iocb->ki_flags & IOCB_NOWAIT) {
1879		if (!inode_trylock(inode))
1880			return -EAGAIN;
1881	} else {
1882		inode_lock(inode);
1883	}
1884
1885	err = generic_write_checks(iocb, from);
1886	if (err <= 0) {
1887		inode_unlock(inode);
1888		return err;
1889	}
1890
1891	pos = iocb->ki_pos;
1892	count = iov_iter_count(from);
1893	if (iocb->ki_flags & IOCB_NOWAIT) {
1894		size_t nocow_bytes = count;
1895
1896		/*
1897		 * We will allocate space in case nodatacow is not set,
1898		 * so bail
1899		 */
1900		if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
1901		    <= 0) {
1902			inode_unlock(inode);
1903			return -EAGAIN;
1904		}
1905		/*
1906		 * There are holes in the range or parts of the range that must
1907		 * be COWed (shared extents, RO block groups, etc), so just bail
1908		 * out.
1909		 */
1910		if (nocow_bytes < count) {
1911			inode_unlock(inode);
1912			return -EAGAIN;
1913		}
1914	}
1915
1916	current->backing_dev_info = inode_to_bdi(inode);
1917	err = file_remove_privs(file);
1918	if (err) {
1919		inode_unlock(inode);
1920		goto out;
1921	}
1922
1923	/*
1924	 * If BTRFS flips readonly due to some impossible error
1925	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1926	 * although we have opened a file as writable, we have
1927	 * to stop this write operation to ensure FS consistency.
1928	 */
1929	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
1930		inode_unlock(inode);
1931		err = -EROFS;
1932		goto out;
1933	}
1934
1935	/*
1936	 * We reserve space for updating the inode when we reserve space for the
1937	 * extent we are going to write, so we will enospc out there.  We don't
1938	 * need to start yet another transaction to update the inode as we will
1939	 * update the inode when we finish writing whatever data we write.
1940	 */
1941	update_time_for_write(inode);
1942
1943	start_pos = round_down(pos, fs_info->sectorsize);
1944	oldsize = i_size_read(inode);
1945	if (start_pos > oldsize) {
1946		/* Expand hole size to cover write data, preventing empty gap */
1947		end_pos = round_up(pos + count,
1948				   fs_info->sectorsize);
1949		err = btrfs_cont_expand(inode, oldsize, end_pos);
1950		if (err) {
1951			inode_unlock(inode);
1952			goto out;
1953		}
1954		if (start_pos > round_up(oldsize, fs_info->sectorsize))
1955			clean_page = 1;
1956	}
1957
1958	if (sync)
1959		atomic_inc(&BTRFS_I(inode)->sync_writers);
1960
1961	if (iocb->ki_flags & IOCB_DIRECT) {
1962		/*
1963		 * 1. We must always clear IOCB_DSYNC in order to not deadlock
1964		 *    in iomap, as it calls generic_write_sync() in this case.
1965		 * 2. If we are async, we can call iomap_dio_complete() either
1966		 *    in
1967		 *
1968		 *    2.1. A worker thread from the last bio completed.  In this
1969		 *	   case we need to mark the btrfs_dio_data that it is
1970		 *	   async in order to call generic_write_sync() properly.
1971		 *	   This is handled by setting BTRFS_DIO_SYNC_STUB in the
1972		 *	   current->journal_info.
1973		 *    2.2  The submitter context, because all IO completed
1974		 *         before we exited iomap_dio_rw().  In this case we can
1975		 *         just re-set the IOCB_DSYNC on the iocb and we'll do
1976		 *         the sync below.  If our ->end_io() gets called and
1977		 *         current->journal_info is set, then we know we're in
1978		 *         our current context and we will clear
1979		 *         current->journal_info to indicate that we need to
1980		 *         sync below.
1981		 */
1982		if (sync) {
1983			ASSERT(current->journal_info == NULL);
1984			iocb->ki_flags &= ~IOCB_DSYNC;
1985			current->journal_info = BTRFS_DIO_SYNC_STUB;
1986		}
1987		num_written = __btrfs_direct_write(iocb, from);
1988
1989		/*
1990		 * As stated above, we cleared journal_info, so we need to do
1991		 * the sync ourselves.
1992		 */
1993		if (sync && current->journal_info == NULL)
1994			iocb->ki_flags |= IOCB_DSYNC;
1995		current->journal_info = NULL;
1996	} else {
1997		num_written = btrfs_buffered_write(iocb, from);
1998		if (num_written > 0)
1999			iocb->ki_pos = pos + num_written;
2000		if (clean_page)
2001			pagecache_isize_extended(inode, oldsize,
2002						i_size_read(inode));
2003	}
2004
2005	inode_unlock(inode);
2006
2007	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
2008
2009	if (num_written > 0)
2010		num_written = generic_write_sync(iocb, num_written);
2011
2012	if (sync)
2013		atomic_dec(&BTRFS_I(inode)->sync_writers);
2014out:
2015	current->backing_dev_info = NULL;
2016	return num_written ? num_written : err;
2017}
2018
2019int btrfs_release_file(struct inode *inode, struct file *filp)
2020{
2021	struct btrfs_file_private *private = filp->private_data;
2022
2023	if (private && private->filldir_buf)
2024		kfree(private->filldir_buf);
2025	kfree(private);
2026	filp->private_data = NULL;
2027
2028	/*
2029	 * Set by setattr when we are about to truncate a file from a non-zero
2030	 * size to a zero size.  This tries to flush down new bytes that may
2031	 * have been written if the application were using truncate to replace
2032	 * a file in place.
2033	 */
2034	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2035			       &BTRFS_I(inode)->runtime_flags))
2036			filemap_flush(inode->i_mapping);
2037	return 0;
2038}
2039
2040static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2041{
2042	int ret;
2043	struct blk_plug plug;
2044
2045	/*
2046	 * This is only called in fsync, which would do synchronous writes, so
2047	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
2048	 * multiple disks using raid profile, a large IO can be split to
2049	 * several segments of stripe length (currently 64K).
2050	 */
2051	blk_start_plug(&plug);
2052	atomic_inc(&BTRFS_I(inode)->sync_writers);
2053	ret = btrfs_fdatawrite_range(inode, start, end);
2054	atomic_dec(&BTRFS_I(inode)->sync_writers);
2055	blk_finish_plug(&plug);
2056
2057	return ret;
2058}
2059
2060static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2061{
2062	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2063	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2064
2065	if (btrfs_inode_in_log(inode, fs_info->generation) &&
2066	    list_empty(&ctx->ordered_extents))
2067		return true;
2068
2069	/*
2070	 * If we are doing a fast fsync we can not bail out if the inode's
2071	 * last_trans is <= then the last committed transaction, because we only
2072	 * update the last_trans of the inode during ordered extent completion,
2073	 * and for a fast fsync we don't wait for that, we only wait for the
2074	 * writeback to complete.
2075	 */
2076	if (inode->last_trans <= fs_info->last_trans_committed &&
2077	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2078	     list_empty(&ctx->ordered_extents)))
2079		return true;
2080
2081	return false;
2082}
2083
2084/*
2085 * fsync call for both files and directories.  This logs the inode into
2086 * the tree log instead of forcing full commits whenever possible.
2087 *
2088 * It needs to call filemap_fdatawait so that all ordered extent updates are
2089 * in the metadata btree are up to date for copying to the log.
2090 *
2091 * It drops the inode mutex before doing the tree log commit.  This is an
2092 * important optimization for directories because holding the mutex prevents
2093 * new operations on the dir while we write to disk.
2094 */
2095int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2096{
2097	struct dentry *dentry = file_dentry(file);
2098	struct inode *inode = d_inode(dentry);
2099	struct btrfs_root *root = BTRFS_I(inode)->root;
2100	struct btrfs_trans_handle *trans;
2101	struct btrfs_log_ctx ctx;
2102	int ret = 0, err;
2103	u64 len;
2104	bool full_sync;
2105
2106	trace_btrfs_sync_file(file, datasync);
2107
2108	btrfs_init_log_ctx(&ctx, inode);
2109
2110	/*
2111	 * Always set the range to a full range, otherwise we can get into
2112	 * several problems, from missing file extent items to represent holes
2113	 * when not using the NO_HOLES feature, to log tree corruption due to
2114	 * races between hole detection during logging and completion of ordered
2115	 * extents outside the range, to missing checksums due to ordered extents
2116	 * for which we flushed only a subset of their pages.
2117	 */
2118	start = 0;
2119	end = LLONG_MAX;
2120	len = (u64)LLONG_MAX + 1;
2121
2122	/*
2123	 * We write the dirty pages in the range and wait until they complete
2124	 * out of the ->i_mutex. If so, we can flush the dirty pages by
2125	 * multi-task, and make the performance up.  See
2126	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2127	 */
2128	ret = start_ordered_ops(inode, start, end);
2129	if (ret)
2130		goto out;
2131
2132	inode_lock(inode);
2133
2134	/*
2135	 * We take the dio_sem here because the tree log stuff can race with
2136	 * lockless dio writes and get an extent map logged for an extent we
2137	 * never waited on.  We need it this high up for lockdep reasons.
2138	 */
2139	down_write(&BTRFS_I(inode)->dio_sem);
2140
2141	atomic_inc(&root->log_batch);
2142
2143	/*
2144	 * Always check for the full sync flag while holding the inode's lock,
2145	 * to avoid races with other tasks. The flag must be either set all the
2146	 * time during logging or always off all the time while logging.
2147	 */
2148	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2149			     &BTRFS_I(inode)->runtime_flags);
2150
2151	/*
2152	 * Before we acquired the inode's lock, someone may have dirtied more
2153	 * pages in the target range. We need to make sure that writeback for
2154	 * any such pages does not start while we are logging the inode, because
2155	 * if it does, any of the following might happen when we are not doing a
2156	 * full inode sync:
2157	 *
2158	 * 1) We log an extent after its writeback finishes but before its
2159	 *    checksums are added to the csum tree, leading to -EIO errors
2160	 *    when attempting to read the extent after a log replay.
2161	 *
2162	 * 2) We can end up logging an extent before its writeback finishes.
2163	 *    Therefore after the log replay we will have a file extent item
2164	 *    pointing to an unwritten extent (and no data checksums as well).
2165	 *
2166	 * So trigger writeback for any eventual new dirty pages and then we
2167	 * wait for all ordered extents to complete below.
2168	 */
2169	ret = start_ordered_ops(inode, start, end);
2170	if (ret) {
2171		up_write(&BTRFS_I(inode)->dio_sem);
2172		inode_unlock(inode);
2173		goto out;
2174	}
2175
2176	/*
2177	 * We have to do this here to avoid the priority inversion of waiting on
2178	 * IO of a lower priority task while holding a transaction open.
2179	 *
2180	 * For a full fsync we wait for the ordered extents to complete while
2181	 * for a fast fsync we wait just for writeback to complete, and then
2182	 * attach the ordered extents to the transaction so that a transaction
2183	 * commit waits for their completion, to avoid data loss if we fsync,
2184	 * the current transaction commits before the ordered extents complete
2185	 * and a power failure happens right after that.
2186	 */
2187	if (full_sync) {
2188		ret = btrfs_wait_ordered_range(inode, start, len);
2189	} else {
2190		/*
2191		 * Get our ordered extents as soon as possible to avoid doing
2192		 * checksum lookups in the csum tree, and use instead the
2193		 * checksums attached to the ordered extents.
2194		 */
2195		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2196						      &ctx.ordered_extents);
2197		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2198	}
2199
2200	if (ret)
2201		goto out_release_extents;
2202
2203	atomic_inc(&root->log_batch);
2204
2205	smp_mb();
2206	if (skip_inode_logging(&ctx)) {
2207		/*
2208		 * We've had everything committed since the last time we were
2209		 * modified so clear this flag in case it was set for whatever
2210		 * reason, it's no longer relevant.
2211		 */
2212		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2213			  &BTRFS_I(inode)->runtime_flags);
2214		/*
2215		 * An ordered extent might have started before and completed
2216		 * already with io errors, in which case the inode was not
2217		 * updated and we end up here. So check the inode's mapping
2218		 * for any errors that might have happened since we last
2219		 * checked called fsync.
2220		 */
2221		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2222		goto out_release_extents;
2223	}
2224
2225	/*
2226	 * We use start here because we will need to wait on the IO to complete
2227	 * in btrfs_sync_log, which could require joining a transaction (for
2228	 * example checking cross references in the nocow path).  If we use join
2229	 * here we could get into a situation where we're waiting on IO to
2230	 * happen that is blocked on a transaction trying to commit.  With start
2231	 * we inc the extwriter counter, so we wait for all extwriters to exit
2232	 * before we start blocking joiners.  This comment is to keep somebody
2233	 * from thinking they are super smart and changing this to
2234	 * btrfs_join_transaction *cough*Josef*cough*.
2235	 */
2236	trans = btrfs_start_transaction(root, 0);
2237	if (IS_ERR(trans)) {
2238		ret = PTR_ERR(trans);
2239		goto out_release_extents;
2240	}
2241
2242	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2243	btrfs_release_log_ctx_extents(&ctx);
2244	if (ret < 0) {
2245		/* Fallthrough and commit/free transaction. */
2246		ret = 1;
2247	}
2248
2249	/* we've logged all the items and now have a consistent
2250	 * version of the file in the log.  It is possible that
2251	 * someone will come in and modify the file, but that's
2252	 * fine because the log is consistent on disk, and we
2253	 * have references to all of the file's extents
2254	 *
2255	 * It is possible that someone will come in and log the
2256	 * file again, but that will end up using the synchronization
2257	 * inside btrfs_sync_log to keep things safe.
2258	 */
2259	up_write(&BTRFS_I(inode)->dio_sem);
2260	inode_unlock(inode);
2261
2262	if (ret != BTRFS_NO_LOG_SYNC) {
2263		if (!ret) {
2264			ret = btrfs_sync_log(trans, root, &ctx);
2265			if (!ret) {
2266				ret = btrfs_end_transaction(trans);
2267				goto out;
2268			}
2269		}
2270		if (!full_sync) {
2271			ret = btrfs_wait_ordered_range(inode, start, len);
2272			if (ret) {
2273				btrfs_end_transaction(trans);
2274				goto out;
2275			}
2276		}
2277		ret = btrfs_commit_transaction(trans);
2278	} else {
2279		ret = btrfs_end_transaction(trans);
2280	}
2281out:
2282	ASSERT(list_empty(&ctx.list));
2283	err = file_check_and_advance_wb_err(file);
2284	if (!ret)
2285		ret = err;
2286	return ret > 0 ? -EIO : ret;
2287
2288out_release_extents:
2289	btrfs_release_log_ctx_extents(&ctx);
2290	up_write(&BTRFS_I(inode)->dio_sem);
2291	inode_unlock(inode);
2292	goto out;
2293}
2294
2295static const struct vm_operations_struct btrfs_file_vm_ops = {
2296	.fault		= filemap_fault,
2297	.map_pages	= filemap_map_pages,
2298	.page_mkwrite	= btrfs_page_mkwrite,
2299};
2300
2301static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
2302{
2303	struct address_space *mapping = filp->f_mapping;
2304
2305	if (!mapping->a_ops->readpage)
2306		return -ENOEXEC;
2307
2308	file_accessed(filp);
2309	vma->vm_ops = &btrfs_file_vm_ops;
2310
2311	return 0;
2312}
2313
2314static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2315			  int slot, u64 start, u64 end)
2316{
2317	struct btrfs_file_extent_item *fi;
2318	struct btrfs_key key;
2319
2320	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2321		return 0;
2322
2323	btrfs_item_key_to_cpu(leaf, &key, slot);
2324	if (key.objectid != btrfs_ino(inode) ||
2325	    key.type != BTRFS_EXTENT_DATA_KEY)
2326		return 0;
2327
2328	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2329
2330	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2331		return 0;
2332
2333	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2334		return 0;
2335
2336	if (key.offset == end)
2337		return 1;
2338	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2339		return 1;
2340	return 0;
2341}
2342
2343static int fill_holes(struct btrfs_trans_handle *trans,
2344		struct btrfs_inode *inode,
2345		struct btrfs_path *path, u64 offset, u64 end)
2346{
2347	struct btrfs_fs_info *fs_info = trans->fs_info;
2348	struct btrfs_root *root = inode->root;
2349	struct extent_buffer *leaf;
2350	struct btrfs_file_extent_item *fi;
2351	struct extent_map *hole_em;
2352	struct extent_map_tree *em_tree = &inode->extent_tree;
2353	struct btrfs_key key;
2354	int ret;
2355
2356	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2357		goto out;
2358
2359	key.objectid = btrfs_ino(inode);
2360	key.type = BTRFS_EXTENT_DATA_KEY;
2361	key.offset = offset;
2362
2363	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2364	if (ret <= 0) {
2365		/*
2366		 * We should have dropped this offset, so if we find it then
2367		 * something has gone horribly wrong.
2368		 */
2369		if (ret == 0)
2370			ret = -EINVAL;
2371		return ret;
2372	}
2373
2374	leaf = path->nodes[0];
2375	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2376		u64 num_bytes;
2377
2378		path->slots[0]--;
2379		fi = btrfs_item_ptr(leaf, path->slots[0],
2380				    struct btrfs_file_extent_item);
2381		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2382			end - offset;
2383		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2384		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2385		btrfs_set_file_extent_offset(leaf, fi, 0);
2386		btrfs_mark_buffer_dirty(leaf);
2387		goto out;
2388	}
2389
2390	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2391		u64 num_bytes;
2392
2393		key.offset = offset;
2394		btrfs_set_item_key_safe(fs_info, path, &key);
2395		fi = btrfs_item_ptr(leaf, path->slots[0],
2396				    struct btrfs_file_extent_item);
2397		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2398			offset;
2399		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2400		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2401		btrfs_set_file_extent_offset(leaf, fi, 0);
2402		btrfs_mark_buffer_dirty(leaf);
2403		goto out;
2404	}
2405	btrfs_release_path(path);
2406
2407	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2408			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2409	if (ret)
2410		return ret;
2411
2412out:
2413	btrfs_release_path(path);
2414
2415	hole_em = alloc_extent_map();
2416	if (!hole_em) {
2417		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2418		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2419	} else {
2420		hole_em->start = offset;
2421		hole_em->len = end - offset;
2422		hole_em->ram_bytes = hole_em->len;
2423		hole_em->orig_start = offset;
2424
2425		hole_em->block_start = EXTENT_MAP_HOLE;
2426		hole_em->block_len = 0;
2427		hole_em->orig_block_len = 0;
2428		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2429		hole_em->generation = trans->transid;
2430
2431		do {
2432			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2433			write_lock(&em_tree->lock);
2434			ret = add_extent_mapping(em_tree, hole_em, 1);
2435			write_unlock(&em_tree->lock);
2436		} while (ret == -EEXIST);
2437		free_extent_map(hole_em);
2438		if (ret)
2439			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2440					&inode->runtime_flags);
2441	}
2442
2443	return 0;
2444}
2445
2446/*
2447 * Find a hole extent on given inode and change start/len to the end of hole
2448 * extent.(hole/vacuum extent whose em->start <= start &&
2449 *	   em->start + em->len > start)
2450 * When a hole extent is found, return 1 and modify start/len.
2451 */
2452static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2453{
2454	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2455	struct extent_map *em;
2456	int ret = 0;
2457
2458	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2459			      round_down(*start, fs_info->sectorsize),
2460			      round_up(*len, fs_info->sectorsize));
2461	if (IS_ERR(em))
2462		return PTR_ERR(em);
2463
2464	/* Hole or vacuum extent(only exists in no-hole mode) */
2465	if (em->block_start == EXTENT_MAP_HOLE) {
2466		ret = 1;
2467		*len = em->start + em->len > *start + *len ?
2468		       0 : *start + *len - em->start - em->len;
2469		*start = em->start + em->len;
2470	}
2471	free_extent_map(em);
2472	return ret;
2473}
2474
2475static int btrfs_punch_hole_lock_range(struct inode *inode,
2476				       const u64 lockstart,
2477				       const u64 lockend,
2478				       struct extent_state **cached_state)
2479{
2480	while (1) {
2481		struct btrfs_ordered_extent *ordered;
2482		int ret;
2483
2484		truncate_pagecache_range(inode, lockstart, lockend);
2485
2486		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2487				 cached_state);
2488		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2489							    lockend);
2490
2491		/*
2492		 * We need to make sure we have no ordered extents in this range
2493		 * and nobody raced in and read a page in this range, if we did
2494		 * we need to try again.
2495		 */
2496		if ((!ordered ||
2497		    (ordered->file_offset + ordered->num_bytes <= lockstart ||
2498		     ordered->file_offset > lockend)) &&
2499		     !filemap_range_has_page(inode->i_mapping,
2500					     lockstart, lockend)) {
2501			if (ordered)
2502				btrfs_put_ordered_extent(ordered);
2503			break;
2504		}
2505		if (ordered)
2506			btrfs_put_ordered_extent(ordered);
2507		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2508				     lockend, cached_state);
2509		ret = btrfs_wait_ordered_range(inode, lockstart,
2510					       lockend - lockstart + 1);
2511		if (ret)
2512			return ret;
2513	}
2514	return 0;
2515}
2516
2517static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2518				     struct inode *inode,
2519				     struct btrfs_path *path,
2520				     struct btrfs_replace_extent_info *extent_info,
2521				     const u64 replace_len)
2522{
2523	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2524	struct btrfs_root *root = BTRFS_I(inode)->root;
2525	struct btrfs_file_extent_item *extent;
2526	struct extent_buffer *leaf;
2527	struct btrfs_key key;
2528	int slot;
2529	struct btrfs_ref ref = { 0 };
2530	int ret;
2531
2532	if (replace_len == 0)
2533		return 0;
2534
2535	if (extent_info->disk_offset == 0 &&
2536	    btrfs_fs_incompat(fs_info, NO_HOLES))
2537		return 0;
2538
2539	key.objectid = btrfs_ino(BTRFS_I(inode));
2540	key.type = BTRFS_EXTENT_DATA_KEY;
2541	key.offset = extent_info->file_offset;
2542	ret = btrfs_insert_empty_item(trans, root, path, &key,
2543				      sizeof(struct btrfs_file_extent_item));
2544	if (ret)
2545		return ret;
2546	leaf = path->nodes[0];
2547	slot = path->slots[0];
2548	write_extent_buffer(leaf, extent_info->extent_buf,
2549			    btrfs_item_ptr_offset(leaf, slot),
2550			    sizeof(struct btrfs_file_extent_item));
2551	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2552	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2553	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2554	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2555	if (extent_info->is_new_extent)
2556		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2557	btrfs_mark_buffer_dirty(leaf);
2558	btrfs_release_path(path);
2559
2560	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
2561			extent_info->file_offset, replace_len);
2562	if (ret)
2563		return ret;
2564
2565	/* If it's a hole, nothing more needs to be done. */
2566	if (extent_info->disk_offset == 0)
2567		return 0;
2568
2569	inode_add_bytes(inode, replace_len);
2570
2571	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2572		key.objectid = extent_info->disk_offset;
2573		key.type = BTRFS_EXTENT_ITEM_KEY;
2574		key.offset = extent_info->disk_len;
2575		ret = btrfs_alloc_reserved_file_extent(trans, root,
2576						       btrfs_ino(BTRFS_I(inode)),
2577						       extent_info->file_offset,
2578						       extent_info->qgroup_reserved,
2579						       &key);
2580	} else {
2581		u64 ref_offset;
2582
2583		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2584				       extent_info->disk_offset,
2585				       extent_info->disk_len, 0);
2586		ref_offset = extent_info->file_offset - extent_info->data_offset;
2587		btrfs_init_data_ref(&ref, root->root_key.objectid,
2588				    btrfs_ino(BTRFS_I(inode)), ref_offset);
2589		ret = btrfs_inc_extent_ref(trans, &ref);
2590	}
2591
2592	extent_info->insertions++;
2593
2594	return ret;
2595}
2596
2597/*
2598 * The respective range must have been previously locked, as well as the inode.
2599 * The end offset is inclusive (last byte of the range).
2600 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2601 * the file range with an extent.
2602 * When not punching a hole, we don't want to end up in a state where we dropped
2603 * extents without inserting a new one, so we must abort the transaction to avoid
2604 * a corruption.
2605 */
2606int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
2607			   const u64 start, const u64 end,
2608			   struct btrfs_replace_extent_info *extent_info,
2609			   struct btrfs_trans_handle **trans_out)
2610{
2611	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2612	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2613	u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2614	struct btrfs_root *root = BTRFS_I(inode)->root;
2615	struct btrfs_trans_handle *trans = NULL;
2616	struct btrfs_block_rsv *rsv;
2617	unsigned int rsv_count;
2618	u64 cur_offset;
2619	u64 drop_end;
2620	u64 len = end - start;
2621	int ret = 0;
2622
2623	if (end <= start)
2624		return -EINVAL;
2625
2626	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2627	if (!rsv) {
2628		ret = -ENOMEM;
2629		goto out;
2630	}
2631	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2632	rsv->failfast = 1;
2633
2634	/*
2635	 * 1 - update the inode
2636	 * 1 - removing the extents in the range
2637	 * 1 - adding the hole extent if no_holes isn't set or if we are
2638	 *     replacing the range with a new extent
2639	 */
2640	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2641		rsv_count = 3;
2642	else
2643		rsv_count = 2;
2644
2645	trans = btrfs_start_transaction(root, rsv_count);
2646	if (IS_ERR(trans)) {
2647		ret = PTR_ERR(trans);
2648		trans = NULL;
2649		goto out_free;
2650	}
2651
2652	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2653				      min_size, false);
2654	BUG_ON(ret);
2655	trans->block_rsv = rsv;
2656
2657	cur_offset = start;
2658	while (cur_offset < end) {
2659		ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
2660					   cur_offset, end + 1, &drop_end,
2661					   1, 0, 0, NULL);
2662		if (ret != -ENOSPC) {
2663			/*
2664			 * The only time we don't want to abort is if we are
2665			 * attempting to clone a partial inline extent, in which
2666			 * case we'll get EOPNOTSUPP.  However if we aren't
2667			 * clone we need to abort no matter what, because if we
2668			 * got EOPNOTSUPP via prealloc then we messed up and
2669			 * need to abort.
2670			 */
2671			if (ret &&
2672			    (ret != -EOPNOTSUPP ||
2673			     (extent_info && extent_info->is_new_extent)))
2674				btrfs_abort_transaction(trans, ret);
2675			break;
2676		}
2677
2678		trans->block_rsv = &fs_info->trans_block_rsv;
2679
2680		if (!extent_info && cur_offset < drop_end &&
2681		    cur_offset < ino_size) {
2682			ret = fill_holes(trans, BTRFS_I(inode), path,
2683					cur_offset, drop_end);
2684			if (ret) {
2685				/*
2686				 * If we failed then we didn't insert our hole
2687				 * entries for the area we dropped, so now the
2688				 * fs is corrupted, so we must abort the
2689				 * transaction.
2690				 */
2691				btrfs_abort_transaction(trans, ret);
2692				break;
2693			}
2694		} else if (!extent_info && cur_offset < drop_end) {
2695			/*
2696			 * We are past the i_size here, but since we didn't
2697			 * insert holes we need to clear the mapped area so we
2698			 * know to not set disk_i_size in this area until a new
2699			 * file extent is inserted here.
2700			 */
2701			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2702					cur_offset, drop_end - cur_offset);
2703			if (ret) {
2704				/*
2705				 * We couldn't clear our area, so we could
2706				 * presumably adjust up and corrupt the fs, so
2707				 * we need to abort.
2708				 */
2709				btrfs_abort_transaction(trans, ret);
2710				break;
2711			}
2712		}
2713
2714		if (extent_info && drop_end > extent_info->file_offset) {
2715			u64 replace_len = drop_end - extent_info->file_offset;
2716
2717			ret = btrfs_insert_replace_extent(trans, inode, path,
2718							extent_info, replace_len);
2719			if (ret) {
2720				btrfs_abort_transaction(trans, ret);
2721				break;
2722			}
2723			extent_info->data_len -= replace_len;
2724			extent_info->data_offset += replace_len;
2725			extent_info->file_offset += replace_len;
2726		}
2727
2728		cur_offset = drop_end;
2729
2730		ret = btrfs_update_inode(trans, root, inode);
2731		if (ret)
2732			break;
2733
2734		btrfs_end_transaction(trans);
2735		btrfs_btree_balance_dirty(fs_info);
2736
2737		trans = btrfs_start_transaction(root, rsv_count);
2738		if (IS_ERR(trans)) {
2739			ret = PTR_ERR(trans);
2740			trans = NULL;
2741			break;
2742		}
2743
2744		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2745					      rsv, min_size, false);
2746		BUG_ON(ret);	/* shouldn't happen */
2747		trans->block_rsv = rsv;
2748
2749		if (!extent_info) {
2750			ret = find_first_non_hole(inode, &cur_offset, &len);
2751			if (unlikely(ret < 0))
2752				break;
2753			if (ret && !len) {
2754				ret = 0;
2755				break;
2756			}
2757		}
2758	}
2759
2760	/*
2761	 * If we were cloning, force the next fsync to be a full one since we
2762	 * we replaced (or just dropped in the case of cloning holes when
2763	 * NO_HOLES is enabled) extents and extent maps.
2764	 * This is for the sake of simplicity, and cloning into files larger
2765	 * than 16Mb would force the full fsync any way (when
2766	 * try_release_extent_mapping() is invoked during page cache truncation.
2767	 */
2768	if (extent_info && !extent_info->is_new_extent)
2769		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2770			&BTRFS_I(inode)->runtime_flags);
2771
2772	if (ret)
2773		goto out_trans;
2774
2775	trans->block_rsv = &fs_info->trans_block_rsv;
2776	/*
2777	 * If we are using the NO_HOLES feature we might have had already an
2778	 * hole that overlaps a part of the region [lockstart, lockend] and
2779	 * ends at (or beyond) lockend. Since we have no file extent items to
2780	 * represent holes, drop_end can be less than lockend and so we must
2781	 * make sure we have an extent map representing the existing hole (the
2782	 * call to __btrfs_drop_extents() might have dropped the existing extent
2783	 * map representing the existing hole), otherwise the fast fsync path
2784	 * will not record the existence of the hole region
2785	 * [existing_hole_start, lockend].
2786	 */
2787	if (drop_end <= end)
2788		drop_end = end + 1;
2789	/*
2790	 * Don't insert file hole extent item if it's for a range beyond eof
2791	 * (because it's useless) or if it represents a 0 bytes range (when
2792	 * cur_offset == drop_end).
2793	 */
2794	if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
2795		ret = fill_holes(trans, BTRFS_I(inode), path,
2796				cur_offset, drop_end);
2797		if (ret) {
2798			/* Same comment as above. */
2799			btrfs_abort_transaction(trans, ret);
2800			goto out_trans;
2801		}
2802	} else if (!extent_info && cur_offset < drop_end) {
2803		/* See the comment in the loop above for the reasoning here. */
2804		ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2805					cur_offset, drop_end - cur_offset);
2806		if (ret) {
2807			btrfs_abort_transaction(trans, ret);
2808			goto out_trans;
2809		}
2810
2811	}
2812	if (extent_info) {
2813		ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
2814						extent_info->data_len);
2815		if (ret) {
2816			btrfs_abort_transaction(trans, ret);
2817			goto out_trans;
2818		}
2819	}
2820
2821out_trans:
2822	if (!trans)
2823		goto out_free;
2824
2825	trans->block_rsv = &fs_info->trans_block_rsv;
2826	if (ret)
2827		btrfs_end_transaction(trans);
2828	else
2829		*trans_out = trans;
2830out_free:
2831	btrfs_free_block_rsv(fs_info, rsv);
2832out:
2833	return ret;
2834}
2835
2836static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2837{
2838	struct inode *inode = file_inode(file);
2839	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2840	struct btrfs_root *root = BTRFS_I(inode)->root;
2841	struct extent_state *cached_state = NULL;
2842	struct btrfs_path *path;
2843	struct btrfs_trans_handle *trans = NULL;
2844	u64 lockstart;
2845	u64 lockend;
2846	u64 tail_start;
2847	u64 tail_len;
2848	u64 orig_start = offset;
2849	int ret = 0;
2850	bool same_block;
2851	u64 ino_size;
2852	bool truncated_block = false;
2853	bool updated_inode = false;
2854
2855	ret = btrfs_wait_ordered_range(inode, offset, len);
2856	if (ret)
2857		return ret;
2858
2859	inode_lock(inode);
2860	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2861	ret = find_first_non_hole(inode, &offset, &len);
2862	if (ret < 0)
2863		goto out_only_mutex;
2864	if (ret && !len) {
2865		/* Already in a large hole */
2866		ret = 0;
2867		goto out_only_mutex;
2868	}
2869
2870	ret = file_modified(file);
2871	if (ret)
2872		goto out_only_mutex;
2873
2874	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2875	lockend = round_down(offset + len,
2876			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2877	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2878		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2879	/*
2880	 * We needn't truncate any block which is beyond the end of the file
2881	 * because we are sure there is no data there.
2882	 */
2883	/*
2884	 * Only do this if we are in the same block and we aren't doing the
2885	 * entire block.
2886	 */
2887	if (same_block && len < fs_info->sectorsize) {
2888		if (offset < ino_size) {
2889			truncated_block = true;
2890			ret = btrfs_truncate_block(inode, offset, len, 0);
2891		} else {
2892			ret = 0;
2893		}
2894		goto out_only_mutex;
2895	}
2896
2897	/* zero back part of the first block */
2898	if (offset < ino_size) {
2899		truncated_block = true;
2900		ret = btrfs_truncate_block(inode, offset, 0, 0);
2901		if (ret) {
2902			inode_unlock(inode);
2903			return ret;
2904		}
2905	}
2906
2907	/* Check the aligned pages after the first unaligned page,
2908	 * if offset != orig_start, which means the first unaligned page
2909	 * including several following pages are already in holes,
2910	 * the extra check can be skipped */
2911	if (offset == orig_start) {
2912		/* after truncate page, check hole again */
2913		len = offset + len - lockstart;
2914		offset = lockstart;
2915		ret = find_first_non_hole(inode, &offset, &len);
2916		if (ret < 0)
2917			goto out_only_mutex;
2918		if (ret && !len) {
2919			ret = 0;
2920			goto out_only_mutex;
2921		}
2922		lockstart = offset;
2923	}
2924
2925	/* Check the tail unaligned part is in a hole */
2926	tail_start = lockend + 1;
2927	tail_len = offset + len - tail_start;
2928	if (tail_len) {
2929		ret = find_first_non_hole(inode, &tail_start, &tail_len);
2930		if (unlikely(ret < 0))
2931			goto out_only_mutex;
2932		if (!ret) {
2933			/* zero the front end of the last page */
2934			if (tail_start + tail_len < ino_size) {
2935				truncated_block = true;
2936				ret = btrfs_truncate_block(inode,
2937							tail_start + tail_len,
2938							0, 1);
2939				if (ret)
2940					goto out_only_mutex;
2941			}
2942		}
2943	}
2944
2945	if (lockend < lockstart) {
2946		ret = 0;
2947		goto out_only_mutex;
2948	}
2949
2950	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2951					  &cached_state);
2952	if (ret)
2953		goto out_only_mutex;
2954
2955	path = btrfs_alloc_path();
2956	if (!path) {
2957		ret = -ENOMEM;
2958		goto out;
2959	}
2960
2961	ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
2962				     &trans);
2963	btrfs_free_path(path);
2964	if (ret)
2965		goto out;
2966
2967	ASSERT(trans != NULL);
2968	inode_inc_iversion(inode);
2969	inode->i_mtime = inode->i_ctime = current_time(inode);
2970	ret = btrfs_update_inode(trans, root, inode);
2971	updated_inode = true;
2972	btrfs_end_transaction(trans);
2973	btrfs_btree_balance_dirty(fs_info);
2974out:
2975	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2976			     &cached_state);
2977out_only_mutex:
2978	if (!updated_inode && truncated_block && !ret) {
2979		/*
2980		 * If we only end up zeroing part of a page, we still need to
2981		 * update the inode item, so that all the time fields are
2982		 * updated as well as the necessary btrfs inode in memory fields
2983		 * for detecting, at fsync time, if the inode isn't yet in the
2984		 * log tree or it's there but not up to date.
2985		 */
2986		struct timespec64 now = current_time(inode);
2987
2988		inode_inc_iversion(inode);
2989		inode->i_mtime = now;
2990		inode->i_ctime = now;
2991		trans = btrfs_start_transaction(root, 1);
2992		if (IS_ERR(trans)) {
2993			ret = PTR_ERR(trans);
2994		} else {
2995			int ret2;
2996
2997			ret = btrfs_update_inode(trans, root, inode);
2998			ret2 = btrfs_end_transaction(trans);
2999			if (!ret)
3000				ret = ret2;
3001		}
3002	}
3003	inode_unlock(inode);
3004	return ret;
3005}
3006
3007/* Helper structure to record which range is already reserved */
3008struct falloc_range {
3009	struct list_head list;
3010	u64 start;
3011	u64 len;
3012};
3013
3014/*
3015 * Helper function to add falloc range
3016 *
3017 * Caller should have locked the larger range of extent containing
3018 * [start, len)
3019 */
3020static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3021{
3022	struct falloc_range *prev = NULL;
3023	struct falloc_range *range = NULL;
3024
3025	if (list_empty(head))
3026		goto insert;
3027
3028	/*
3029	 * As fallocate iterate by bytenr order, we only need to check
3030	 * the last range.
3031	 */
3032	prev = list_entry(head->prev, struct falloc_range, list);
3033	if (prev->start + prev->len == start) {
3034		prev->len += len;
3035		return 0;
3036	}
3037insert:
3038	range = kmalloc(sizeof(*range), GFP_KERNEL);
3039	if (!range)
3040		return -ENOMEM;
3041	range->start = start;
3042	range->len = len;
3043	list_add_tail(&range->list, head);
3044	return 0;
3045}
3046
3047static int btrfs_fallocate_update_isize(struct inode *inode,
3048					const u64 end,
3049					const int mode)
3050{
3051	struct btrfs_trans_handle *trans;
3052	struct btrfs_root *root = BTRFS_I(inode)->root;
3053	int ret;
3054	int ret2;
3055
3056	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3057		return 0;
3058
3059	trans = btrfs_start_transaction(root, 1);
3060	if (IS_ERR(trans))
3061		return PTR_ERR(trans);
3062
3063	inode->i_ctime = current_time(inode);
3064	i_size_write(inode, end);
3065	btrfs_inode_safe_disk_i_size_write(inode, 0);
3066	ret = btrfs_update_inode(trans, root, inode);
3067	ret2 = btrfs_end_transaction(trans);
3068
3069	return ret ? ret : ret2;
3070}
3071
3072enum {
3073	RANGE_BOUNDARY_WRITTEN_EXTENT,
3074	RANGE_BOUNDARY_PREALLOC_EXTENT,
3075	RANGE_BOUNDARY_HOLE,
3076};
3077
3078static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3079						 u64 offset)
3080{
3081	const u64 sectorsize = btrfs_inode_sectorsize(inode);
3082	struct extent_map *em;
3083	int ret;
3084
3085	offset = round_down(offset, sectorsize);
3086	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3087	if (IS_ERR(em))
3088		return PTR_ERR(em);
3089
3090	if (em->block_start == EXTENT_MAP_HOLE)
3091		ret = RANGE_BOUNDARY_HOLE;
3092	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3093		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3094	else
3095		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3096
3097	free_extent_map(em);
3098	return ret;
3099}
3100
3101static int btrfs_zero_range(struct inode *inode,
3102			    loff_t offset,
3103			    loff_t len,
3104			    const int mode)
3105{
3106	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3107	struct extent_map *em;
3108	struct extent_changeset *data_reserved = NULL;
3109	int ret;
3110	u64 alloc_hint = 0;
3111	const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3112	u64 alloc_start = round_down(offset, sectorsize);
3113	u64 alloc_end = round_up(offset + len, sectorsize);
3114	u64 bytes_to_reserve = 0;
3115	bool space_reserved = false;
3116
3117	inode_dio_wait(inode);
3118
3119	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3120			      alloc_end - alloc_start);
3121	if (IS_ERR(em)) {
3122		ret = PTR_ERR(em);
3123		goto out;
3124	}
3125
3126	/*
3127	 * Avoid hole punching and extent allocation for some cases. More cases
3128	 * could be considered, but these are unlikely common and we keep things
3129	 * as simple as possible for now. Also, intentionally, if the target
3130	 * range contains one or more prealloc extents together with regular
3131	 * extents and holes, we drop all the existing extents and allocate a
3132	 * new prealloc extent, so that we get a larger contiguous disk extent.
3133	 */
3134	if (em->start <= alloc_start &&
3135	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3136		const u64 em_end = em->start + em->len;
3137
3138		if (em_end >= offset + len) {
3139			/*
3140			 * The whole range is already a prealloc extent,
3141			 * do nothing except updating the inode's i_size if
3142			 * needed.
3143			 */
3144			free_extent_map(em);
3145			ret = btrfs_fallocate_update_isize(inode, offset + len,
3146							   mode);
3147			goto out;
3148		}
3149		/*
3150		 * Part of the range is already a prealloc extent, so operate
3151		 * only on the remaining part of the range.
3152		 */
3153		alloc_start = em_end;
3154		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3155		len = offset + len - alloc_start;
3156		offset = alloc_start;
3157		alloc_hint = em->block_start + em->len;
3158	}
3159	free_extent_map(em);
3160
3161	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3162	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3163		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3164				      sectorsize);
3165		if (IS_ERR(em)) {
3166			ret = PTR_ERR(em);
3167			goto out;
3168		}
3169
3170		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3171			free_extent_map(em);
3172			ret = btrfs_fallocate_update_isize(inode, offset + len,
3173							   mode);
3174			goto out;
3175		}
3176		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3177			free_extent_map(em);
3178			ret = btrfs_truncate_block(inode, offset, len, 0);
3179			if (!ret)
3180				ret = btrfs_fallocate_update_isize(inode,
3181								   offset + len,
3182								   mode);
3183			return ret;
3184		}
3185		free_extent_map(em);
3186		alloc_start = round_down(offset, sectorsize);
3187		alloc_end = alloc_start + sectorsize;
3188		goto reserve_space;
3189	}
3190
3191	alloc_start = round_up(offset, sectorsize);
3192	alloc_end = round_down(offset + len, sectorsize);
3193
3194	/*
3195	 * For unaligned ranges, check the pages at the boundaries, they might
3196	 * map to an extent, in which case we need to partially zero them, or
3197	 * they might map to a hole, in which case we need our allocation range
3198	 * to cover them.
3199	 */
3200	if (!IS_ALIGNED(offset, sectorsize)) {
3201		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3202							    offset);
3203		if (ret < 0)
3204			goto out;
3205		if (ret == RANGE_BOUNDARY_HOLE) {
3206			alloc_start = round_down(offset, sectorsize);
3207			ret = 0;
3208		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3209			ret = btrfs_truncate_block(inode, offset, 0, 0);
3210			if (ret)
3211				goto out;
3212		} else {
3213			ret = 0;
3214		}
3215	}
3216
3217	if (!IS_ALIGNED(offset + len, sectorsize)) {
3218		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3219							    offset + len);
3220		if (ret < 0)
3221			goto out;
3222		if (ret == RANGE_BOUNDARY_HOLE) {
3223			alloc_end = round_up(offset + len, sectorsize);
3224			ret = 0;
3225		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3226			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
3227			if (ret)
3228				goto out;
3229		} else {
3230			ret = 0;
3231		}
3232	}
3233
3234reserve_space:
3235	if (alloc_start < alloc_end) {
3236		struct extent_state *cached_state = NULL;
3237		const u64 lockstart = alloc_start;
3238		const u64 lockend = alloc_end - 1;
3239
3240		bytes_to_reserve = alloc_end - alloc_start;
3241		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3242						      bytes_to_reserve);
3243		if (ret < 0)
3244			goto out;
3245		space_reserved = true;
3246		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3247						  &cached_state);
3248		if (ret)
3249			goto out;
3250		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3251						alloc_start, bytes_to_reserve);
3252		if (ret) {
3253			unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3254					     lockend, &cached_state);
3255			goto out;
3256		}
3257		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3258						alloc_end - alloc_start,
3259						i_blocksize(inode),
3260						offset + len, &alloc_hint);
3261		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3262				     lockend, &cached_state);
3263		/* btrfs_prealloc_file_range releases reserved space on error */
3264		if (ret) {
3265			space_reserved = false;
3266			goto out;
3267		}
3268	}
3269	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3270 out:
3271	if (ret && space_reserved)
3272		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3273					       alloc_start, bytes_to_reserve);
3274	extent_changeset_free(data_reserved);
3275
3276	return ret;
3277}
3278
3279static long btrfs_fallocate(struct file *file, int mode,
3280			    loff_t offset, loff_t len)
3281{
3282	struct inode *inode = file_inode(file);
3283	struct extent_state *cached_state = NULL;
3284	struct extent_changeset *data_reserved = NULL;
3285	struct falloc_range *range;
3286	struct falloc_range *tmp;
3287	struct list_head reserve_list;
3288	u64 cur_offset;
3289	u64 last_byte;
3290	u64 alloc_start;
3291	u64 alloc_end;
3292	u64 alloc_hint = 0;
3293	u64 locked_end;
3294	u64 actual_end = 0;
3295	struct extent_map *em;
3296	int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3297	int ret;
3298
3299	alloc_start = round_down(offset, blocksize);
3300	alloc_end = round_up(offset + len, blocksize);
3301	cur_offset = alloc_start;
3302
3303	/* Make sure we aren't being give some crap mode */
3304	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3305		     FALLOC_FL_ZERO_RANGE))
3306		return -EOPNOTSUPP;
3307
3308	if (mode & FALLOC_FL_PUNCH_HOLE)
3309		return btrfs_punch_hole(file, offset, len);
3310
3311	/*
3312	 * Only trigger disk allocation, don't trigger qgroup reserve
3313	 *
3314	 * For qgroup space, it will be checked later.
3315	 */
3316	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3317		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3318						      alloc_end - alloc_start);
3319		if (ret < 0)
3320			return ret;
3321	}
3322
3323	inode_lock(inode);
3324
3325	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3326		ret = inode_newsize_ok(inode, offset + len);
3327		if (ret)
3328			goto out;
3329	}
3330
3331	ret = file_modified(file);
3332	if (ret)
3333		goto out;
3334
3335	/*
3336	 * TODO: Move these two operations after we have checked
3337	 * accurate reserved space, or fallocate can still fail but
3338	 * with page truncated or size expanded.
3339	 *
3340	 * But that's a minor problem and won't do much harm BTW.
3341	 */
3342	if (alloc_start > inode->i_size) {
3343		ret = btrfs_cont_expand(inode, i_size_read(inode),
3344					alloc_start);
3345		if (ret)
3346			goto out;
3347	} else if (offset + len > inode->i_size) {
3348		/*
3349		 * If we are fallocating from the end of the file onward we
3350		 * need to zero out the end of the block if i_size lands in the
3351		 * middle of a block.
3352		 */
3353		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
3354		if (ret)
3355			goto out;
3356	}
3357
3358	/*
3359	 * wait for ordered IO before we have any locks.  We'll loop again
3360	 * below with the locks held.
3361	 */
3362	ret = btrfs_wait_ordered_range(inode, alloc_start,
3363				       alloc_end - alloc_start);
3364	if (ret)
3365		goto out;
3366
3367	if (mode & FALLOC_FL_ZERO_RANGE) {
3368		ret = btrfs_zero_range(inode, offset, len, mode);
3369		inode_unlock(inode);
3370		return ret;
3371	}
3372
3373	locked_end = alloc_end - 1;
3374	while (1) {
3375		struct btrfs_ordered_extent *ordered;
3376
3377		/* the extent lock is ordered inside the running
3378		 * transaction
3379		 */
3380		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3381				 locked_end, &cached_state);
3382		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3383							    locked_end);
3384
3385		if (ordered &&
3386		    ordered->file_offset + ordered->num_bytes > alloc_start &&
3387		    ordered->file_offset < alloc_end) {
3388			btrfs_put_ordered_extent(ordered);
3389			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3390					     alloc_start, locked_end,
3391					     &cached_state);
3392			/*
3393			 * we can't wait on the range with the transaction
3394			 * running or with the extent lock held
3395			 */
3396			ret = btrfs_wait_ordered_range(inode, alloc_start,
3397						       alloc_end - alloc_start);
3398			if (ret)
3399				goto out;
3400		} else {
3401			if (ordered)
3402				btrfs_put_ordered_extent(ordered);
3403			break;
3404		}
3405	}
3406
3407	/* First, check if we exceed the qgroup limit */
3408	INIT_LIST_HEAD(&reserve_list);
3409	while (cur_offset < alloc_end) {
3410		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3411				      alloc_end - cur_offset);
3412		if (IS_ERR(em)) {
3413			ret = PTR_ERR(em);
3414			break;
3415		}
3416		last_byte = min(extent_map_end(em), alloc_end);
3417		actual_end = min_t(u64, extent_map_end(em), offset + len);
3418		last_byte = ALIGN(last_byte, blocksize);
3419		if (em->block_start == EXTENT_MAP_HOLE ||
3420		    (cur_offset >= inode->i_size &&
3421		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3422			ret = add_falloc_range(&reserve_list, cur_offset,
3423					       last_byte - cur_offset);
3424			if (ret < 0) {
3425				free_extent_map(em);
3426				break;
3427			}
3428			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3429					&data_reserved, cur_offset,
3430					last_byte - cur_offset);
3431			if (ret < 0) {
3432				cur_offset = last_byte;
3433				free_extent_map(em);
3434				break;
3435			}
3436		} else {
3437			/*
3438			 * Do not need to reserve unwritten extent for this
3439			 * range, free reserved data space first, otherwise
3440			 * it'll result in false ENOSPC error.
3441			 */
3442			btrfs_free_reserved_data_space(BTRFS_I(inode),
3443				data_reserved, cur_offset,
3444				last_byte - cur_offset);
3445		}
3446		free_extent_map(em);
3447		cur_offset = last_byte;
3448	}
3449
3450	/*
3451	 * If ret is still 0, means we're OK to fallocate.
3452	 * Or just cleanup the list and exit.
3453	 */
3454	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3455		if (!ret)
3456			ret = btrfs_prealloc_file_range(inode, mode,
3457					range->start,
3458					range->len, i_blocksize(inode),
3459					offset + len, &alloc_hint);
3460		else
3461			btrfs_free_reserved_data_space(BTRFS_I(inode),
3462					data_reserved, range->start,
3463					range->len);
3464		list_del(&range->list);
3465		kfree(range);
3466	}
3467	if (ret < 0)
3468		goto out_unlock;
3469
3470	/*
3471	 * We didn't need to allocate any more space, but we still extended the
3472	 * size of the file so we need to update i_size and the inode item.
3473	 */
3474	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3475out_unlock:
3476	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3477			     &cached_state);
3478out:
3479	inode_unlock(inode);
3480	/* Let go of our reservation. */
3481	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3482		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3483				cur_offset, alloc_end - cur_offset);
3484	extent_changeset_free(data_reserved);
3485	return ret;
3486}
3487
3488static loff_t find_desired_extent(struct inode *inode, loff_t offset,
3489				  int whence)
3490{
3491	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3492	struct extent_map *em = NULL;
3493	struct extent_state *cached_state = NULL;
3494	loff_t i_size = inode->i_size;
3495	u64 lockstart;
3496	u64 lockend;
3497	u64 start;
3498	u64 len;
3499	int ret = 0;
3500
3501	if (i_size == 0 || offset >= i_size)
3502		return -ENXIO;
3503
3504	/*
3505	 * offset can be negative, in this case we start finding DATA/HOLE from
3506	 * the very start of the file.
3507	 */
3508	start = max_t(loff_t, 0, offset);
3509
3510	lockstart = round_down(start, fs_info->sectorsize);
3511	lockend = round_up(i_size, fs_info->sectorsize);
3512	if (lockend <= lockstart)
3513		lockend = lockstart + fs_info->sectorsize;
3514	lockend--;
3515	len = lockend - lockstart + 1;
3516
3517	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3518			 &cached_state);
3519
3520	while (start < i_size) {
3521		em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
3522		if (IS_ERR(em)) {
3523			ret = PTR_ERR(em);
3524			em = NULL;
3525			break;
3526		}
3527
3528		if (whence == SEEK_HOLE &&
3529		    (em->block_start == EXTENT_MAP_HOLE ||
3530		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3531			break;
3532		else if (whence == SEEK_DATA &&
3533			   (em->block_start != EXTENT_MAP_HOLE &&
3534			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3535			break;
3536
3537		start = em->start + em->len;
3538		free_extent_map(em);
3539		em = NULL;
3540		cond_resched();
3541	}
3542	free_extent_map(em);
3543	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3544			     &cached_state);
3545	if (ret) {
3546		offset = ret;
3547	} else {
3548		if (whence == SEEK_DATA && start >= i_size)
3549			offset = -ENXIO;
3550		else
3551			offset = min_t(loff_t, start, i_size);
3552	}
3553
3554	return offset;
3555}
3556
3557static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3558{
3559	struct inode *inode = file->f_mapping->host;
3560
3561	switch (whence) {
3562	default:
3563		return generic_file_llseek(file, offset, whence);
3564	case SEEK_DATA:
3565	case SEEK_HOLE:
3566		inode_lock_shared(inode);
3567		offset = find_desired_extent(inode, offset, whence);
3568		inode_unlock_shared(inode);
3569		break;
3570	}
3571
3572	if (offset < 0)
3573		return offset;
3574
3575	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3576}
3577
3578static int btrfs_file_open(struct inode *inode, struct file *filp)
3579{
3580	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3581	return generic_file_open(inode, filp);
3582}
3583
3584static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3585{
3586	ssize_t ret = 0;
3587
3588	if (iocb->ki_flags & IOCB_DIRECT) {
3589		struct inode *inode = file_inode(iocb->ki_filp);
3590
3591		inode_lock_shared(inode);
3592		ret = btrfs_direct_IO(iocb, to);
3593		inode_unlock_shared(inode);
3594		if (ret < 0 || !iov_iter_count(to) ||
3595		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3596			return ret;
3597	}
3598
3599	return generic_file_buffered_read(iocb, to, ret);
3600}
3601
3602const struct file_operations btrfs_file_operations = {
3603	.llseek		= btrfs_file_llseek,
3604	.read_iter      = btrfs_file_read_iter,
3605	.splice_read	= generic_file_splice_read,
3606	.write_iter	= btrfs_file_write_iter,
3607	.splice_write	= iter_file_splice_write,
3608	.mmap		= btrfs_file_mmap,
3609	.open		= btrfs_file_open,
3610	.release	= btrfs_release_file,
3611	.fsync		= btrfs_sync_file,
3612	.fallocate	= btrfs_fallocate,
3613	.unlocked_ioctl	= btrfs_ioctl,
3614#ifdef CONFIG_COMPAT
3615	.compat_ioctl	= btrfs_compat_ioctl,
3616#endif
3617	.remap_file_range = btrfs_remap_file_range,
3618};
3619
3620void __cold btrfs_auto_defrag_exit(void)
3621{
3622	kmem_cache_destroy(btrfs_inode_defrag_cachep);
3623}
3624
3625int __init btrfs_auto_defrag_init(void)
3626{
3627	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3628					sizeof(struct inode_defrag), 0,
3629					SLAB_MEM_SPREAD,
3630					NULL);
3631	if (!btrfs_inode_defrag_cachep)
3632		return -ENOMEM;
3633
3634	return 0;
3635}
3636
3637int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3638{
3639	int ret;
3640
3641	/*
3642	 * So with compression we will find and lock a dirty page and clear the
3643	 * first one as dirty, setup an async extent, and immediately return
3644	 * with the entire range locked but with nobody actually marked with
3645	 * writeback.  So we can't just filemap_write_and_wait_range() and
3646	 * expect it to work since it will just kick off a thread to do the
3647	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3648	 * since it will wait on the page lock, which won't be unlocked until
3649	 * after the pages have been marked as writeback and so we're good to go
3650	 * from there.  We have to do this otherwise we'll miss the ordered
3651	 * extents and that results in badness.  Please Josef, do not think you
3652	 * know better and pull this out at some point in the future, it is
3653	 * right and you are wrong.
3654	 */
3655	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3656	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3657			     &BTRFS_I(inode)->runtime_flags))
3658		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3659
3660	return ret;
3661}
3662