18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Copyright (C) 2007 Oracle.  All rights reserved.
48c2ecf20Sopenharmony_ci */
58c2ecf20Sopenharmony_ci
68c2ecf20Sopenharmony_ci#include <linux/slab.h>
78c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
88c2ecf20Sopenharmony_ci#include <linux/writeback.h>
98c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
108c2ecf20Sopenharmony_ci#include "misc.h"
118c2ecf20Sopenharmony_ci#include "ctree.h"
128c2ecf20Sopenharmony_ci#include "transaction.h"
138c2ecf20Sopenharmony_ci#include "btrfs_inode.h"
148c2ecf20Sopenharmony_ci#include "extent_io.h"
158c2ecf20Sopenharmony_ci#include "disk-io.h"
168c2ecf20Sopenharmony_ci#include "compression.h"
178c2ecf20Sopenharmony_ci#include "delalloc-space.h"
188c2ecf20Sopenharmony_ci#include "qgroup.h"
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_cistatic struct kmem_cache *btrfs_ordered_extent_cache;
218c2ecf20Sopenharmony_ci
228c2ecf20Sopenharmony_cistatic u64 entry_end(struct btrfs_ordered_extent *entry)
238c2ecf20Sopenharmony_ci{
248c2ecf20Sopenharmony_ci	if (entry->file_offset + entry->num_bytes < entry->file_offset)
258c2ecf20Sopenharmony_ci		return (u64)-1;
268c2ecf20Sopenharmony_ci	return entry->file_offset + entry->num_bytes;
278c2ecf20Sopenharmony_ci}
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci/* returns NULL if the insertion worked, or it returns the node it did find
308c2ecf20Sopenharmony_ci * in the tree
318c2ecf20Sopenharmony_ci */
328c2ecf20Sopenharmony_cistatic struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
338c2ecf20Sopenharmony_ci				   struct rb_node *node)
348c2ecf20Sopenharmony_ci{
358c2ecf20Sopenharmony_ci	struct rb_node **p = &root->rb_node;
368c2ecf20Sopenharmony_ci	struct rb_node *parent = NULL;
378c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry;
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_ci	while (*p) {
408c2ecf20Sopenharmony_ci		parent = *p;
418c2ecf20Sopenharmony_ci		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci		if (file_offset < entry->file_offset)
448c2ecf20Sopenharmony_ci			p = &(*p)->rb_left;
458c2ecf20Sopenharmony_ci		else if (file_offset >= entry_end(entry))
468c2ecf20Sopenharmony_ci			p = &(*p)->rb_right;
478c2ecf20Sopenharmony_ci		else
488c2ecf20Sopenharmony_ci			return parent;
498c2ecf20Sopenharmony_ci	}
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_ci	rb_link_node(node, parent, p);
528c2ecf20Sopenharmony_ci	rb_insert_color(node, root);
538c2ecf20Sopenharmony_ci	return NULL;
548c2ecf20Sopenharmony_ci}
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci/*
578c2ecf20Sopenharmony_ci * look for a given offset in the tree, and if it can't be found return the
588c2ecf20Sopenharmony_ci * first lesser offset
598c2ecf20Sopenharmony_ci */
608c2ecf20Sopenharmony_cistatic struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
618c2ecf20Sopenharmony_ci				     struct rb_node **prev_ret)
628c2ecf20Sopenharmony_ci{
638c2ecf20Sopenharmony_ci	struct rb_node *n = root->rb_node;
648c2ecf20Sopenharmony_ci	struct rb_node *prev = NULL;
658c2ecf20Sopenharmony_ci	struct rb_node *test;
668c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry;
678c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *prev_entry = NULL;
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci	while (n) {
708c2ecf20Sopenharmony_ci		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
718c2ecf20Sopenharmony_ci		prev = n;
728c2ecf20Sopenharmony_ci		prev_entry = entry;
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci		if (file_offset < entry->file_offset)
758c2ecf20Sopenharmony_ci			n = n->rb_left;
768c2ecf20Sopenharmony_ci		else if (file_offset >= entry_end(entry))
778c2ecf20Sopenharmony_ci			n = n->rb_right;
788c2ecf20Sopenharmony_ci		else
798c2ecf20Sopenharmony_ci			return n;
808c2ecf20Sopenharmony_ci	}
818c2ecf20Sopenharmony_ci	if (!prev_ret)
828c2ecf20Sopenharmony_ci		return NULL;
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci	while (prev && file_offset >= entry_end(prev_entry)) {
858c2ecf20Sopenharmony_ci		test = rb_next(prev);
868c2ecf20Sopenharmony_ci		if (!test)
878c2ecf20Sopenharmony_ci			break;
888c2ecf20Sopenharmony_ci		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
898c2ecf20Sopenharmony_ci				      rb_node);
908c2ecf20Sopenharmony_ci		if (file_offset < entry_end(prev_entry))
918c2ecf20Sopenharmony_ci			break;
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci		prev = test;
948c2ecf20Sopenharmony_ci	}
958c2ecf20Sopenharmony_ci	if (prev)
968c2ecf20Sopenharmony_ci		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
978c2ecf20Sopenharmony_ci				      rb_node);
988c2ecf20Sopenharmony_ci	while (prev && file_offset < entry_end(prev_entry)) {
998c2ecf20Sopenharmony_ci		test = rb_prev(prev);
1008c2ecf20Sopenharmony_ci		if (!test)
1018c2ecf20Sopenharmony_ci			break;
1028c2ecf20Sopenharmony_ci		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
1038c2ecf20Sopenharmony_ci				      rb_node);
1048c2ecf20Sopenharmony_ci		prev = test;
1058c2ecf20Sopenharmony_ci	}
1068c2ecf20Sopenharmony_ci	*prev_ret = prev;
1078c2ecf20Sopenharmony_ci	return NULL;
1088c2ecf20Sopenharmony_ci}
1098c2ecf20Sopenharmony_ci
1108c2ecf20Sopenharmony_ci/*
1118c2ecf20Sopenharmony_ci * helper to check if a given offset is inside a given entry
1128c2ecf20Sopenharmony_ci */
1138c2ecf20Sopenharmony_cistatic int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
1148c2ecf20Sopenharmony_ci{
1158c2ecf20Sopenharmony_ci	if (file_offset < entry->file_offset ||
1168c2ecf20Sopenharmony_ci	    entry->file_offset + entry->num_bytes <= file_offset)
1178c2ecf20Sopenharmony_ci		return 0;
1188c2ecf20Sopenharmony_ci	return 1;
1198c2ecf20Sopenharmony_ci}
1208c2ecf20Sopenharmony_ci
1218c2ecf20Sopenharmony_cistatic int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
1228c2ecf20Sopenharmony_ci			  u64 len)
1238c2ecf20Sopenharmony_ci{
1248c2ecf20Sopenharmony_ci	if (file_offset + len <= entry->file_offset ||
1258c2ecf20Sopenharmony_ci	    entry->file_offset + entry->num_bytes <= file_offset)
1268c2ecf20Sopenharmony_ci		return 0;
1278c2ecf20Sopenharmony_ci	return 1;
1288c2ecf20Sopenharmony_ci}
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci/*
1318c2ecf20Sopenharmony_ci * look find the first ordered struct that has this offset, otherwise
1328c2ecf20Sopenharmony_ci * the first one less than this offset
1338c2ecf20Sopenharmony_ci */
1348c2ecf20Sopenharmony_cistatic inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
1358c2ecf20Sopenharmony_ci					  u64 file_offset)
1368c2ecf20Sopenharmony_ci{
1378c2ecf20Sopenharmony_ci	struct rb_root *root = &tree->tree;
1388c2ecf20Sopenharmony_ci	struct rb_node *prev = NULL;
1398c2ecf20Sopenharmony_ci	struct rb_node *ret;
1408c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry;
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci	if (tree->last) {
1438c2ecf20Sopenharmony_ci		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
1448c2ecf20Sopenharmony_ci				 rb_node);
1458c2ecf20Sopenharmony_ci		if (offset_in_entry(entry, file_offset))
1468c2ecf20Sopenharmony_ci			return tree->last;
1478c2ecf20Sopenharmony_ci	}
1488c2ecf20Sopenharmony_ci	ret = __tree_search(root, file_offset, &prev);
1498c2ecf20Sopenharmony_ci	if (!ret)
1508c2ecf20Sopenharmony_ci		ret = prev;
1518c2ecf20Sopenharmony_ci	if (ret)
1528c2ecf20Sopenharmony_ci		tree->last = ret;
1538c2ecf20Sopenharmony_ci	return ret;
1548c2ecf20Sopenharmony_ci}
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci/*
1578c2ecf20Sopenharmony_ci * Allocate and add a new ordered_extent into the per-inode tree.
1588c2ecf20Sopenharmony_ci *
1598c2ecf20Sopenharmony_ci * The tree is given a single reference on the ordered extent that was
1608c2ecf20Sopenharmony_ci * inserted.
1618c2ecf20Sopenharmony_ci */
1628c2ecf20Sopenharmony_cistatic int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
1638c2ecf20Sopenharmony_ci				      u64 disk_bytenr, u64 num_bytes,
1648c2ecf20Sopenharmony_ci				      u64 disk_num_bytes, int type, int dio,
1658c2ecf20Sopenharmony_ci				      int compress_type)
1668c2ecf20Sopenharmony_ci{
1678c2ecf20Sopenharmony_ci	struct btrfs_root *root = inode->root;
1688c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = root->fs_info;
1698c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
1708c2ecf20Sopenharmony_ci	struct rb_node *node;
1718c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry;
1728c2ecf20Sopenharmony_ci	int ret;
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
1758c2ecf20Sopenharmony_ci		/* For nocow write, we can release the qgroup rsv right now */
1768c2ecf20Sopenharmony_ci		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
1778c2ecf20Sopenharmony_ci		if (ret < 0)
1788c2ecf20Sopenharmony_ci			return ret;
1798c2ecf20Sopenharmony_ci		ret = 0;
1808c2ecf20Sopenharmony_ci	} else {
1818c2ecf20Sopenharmony_ci		/*
1828c2ecf20Sopenharmony_ci		 * The ordered extent has reserved qgroup space, release now
1838c2ecf20Sopenharmony_ci		 * and pass the reserved number for qgroup_record to free.
1848c2ecf20Sopenharmony_ci		 */
1858c2ecf20Sopenharmony_ci		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
1868c2ecf20Sopenharmony_ci		if (ret < 0)
1878c2ecf20Sopenharmony_ci			return ret;
1888c2ecf20Sopenharmony_ci	}
1898c2ecf20Sopenharmony_ci	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
1908c2ecf20Sopenharmony_ci	if (!entry)
1918c2ecf20Sopenharmony_ci		return -ENOMEM;
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci	entry->file_offset = file_offset;
1948c2ecf20Sopenharmony_ci	entry->disk_bytenr = disk_bytenr;
1958c2ecf20Sopenharmony_ci	entry->num_bytes = num_bytes;
1968c2ecf20Sopenharmony_ci	entry->disk_num_bytes = disk_num_bytes;
1978c2ecf20Sopenharmony_ci	entry->bytes_left = num_bytes;
1988c2ecf20Sopenharmony_ci	entry->inode = igrab(&inode->vfs_inode);
1998c2ecf20Sopenharmony_ci	entry->compress_type = compress_type;
2008c2ecf20Sopenharmony_ci	entry->truncated_len = (u64)-1;
2018c2ecf20Sopenharmony_ci	entry->qgroup_rsv = ret;
2028c2ecf20Sopenharmony_ci	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
2038c2ecf20Sopenharmony_ci		set_bit(type, &entry->flags);
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	if (dio) {
2068c2ecf20Sopenharmony_ci		percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
2078c2ecf20Sopenharmony_ci					 fs_info->delalloc_batch);
2088c2ecf20Sopenharmony_ci		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
2098c2ecf20Sopenharmony_ci	}
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	/* one ref for the tree */
2128c2ecf20Sopenharmony_ci	refcount_set(&entry->refs, 1);
2138c2ecf20Sopenharmony_ci	init_waitqueue_head(&entry->wait);
2148c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&entry->list);
2158c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&entry->log_list);
2168c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&entry->root_extent_list);
2178c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&entry->work_list);
2188c2ecf20Sopenharmony_ci	init_completion(&entry->completion);
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci	trace_btrfs_ordered_extent_add(inode, entry);
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
2238c2ecf20Sopenharmony_ci	node = tree_insert(&tree->tree, file_offset,
2248c2ecf20Sopenharmony_ci			   &entry->rb_node);
2258c2ecf20Sopenharmony_ci	if (node)
2268c2ecf20Sopenharmony_ci		btrfs_panic(fs_info, -EEXIST,
2278c2ecf20Sopenharmony_ci				"inconsistency in ordered tree at offset %llu",
2288c2ecf20Sopenharmony_ci				file_offset);
2298c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
2308c2ecf20Sopenharmony_ci
2318c2ecf20Sopenharmony_ci	spin_lock(&root->ordered_extent_lock);
2328c2ecf20Sopenharmony_ci	list_add_tail(&entry->root_extent_list,
2338c2ecf20Sopenharmony_ci		      &root->ordered_extents);
2348c2ecf20Sopenharmony_ci	root->nr_ordered_extents++;
2358c2ecf20Sopenharmony_ci	if (root->nr_ordered_extents == 1) {
2368c2ecf20Sopenharmony_ci		spin_lock(&fs_info->ordered_root_lock);
2378c2ecf20Sopenharmony_ci		BUG_ON(!list_empty(&root->ordered_root));
2388c2ecf20Sopenharmony_ci		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
2398c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->ordered_root_lock);
2408c2ecf20Sopenharmony_ci	}
2418c2ecf20Sopenharmony_ci	spin_unlock(&root->ordered_extent_lock);
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_ci	/*
2448c2ecf20Sopenharmony_ci	 * We don't need the count_max_extents here, we can assume that all of
2458c2ecf20Sopenharmony_ci	 * that work has been done at higher layers, so this is truly the
2468c2ecf20Sopenharmony_ci	 * smallest the extent is going to get.
2478c2ecf20Sopenharmony_ci	 */
2488c2ecf20Sopenharmony_ci	spin_lock(&inode->lock);
2498c2ecf20Sopenharmony_ci	btrfs_mod_outstanding_extents(inode, 1);
2508c2ecf20Sopenharmony_ci	spin_unlock(&inode->lock);
2518c2ecf20Sopenharmony_ci
2528c2ecf20Sopenharmony_ci	return 0;
2538c2ecf20Sopenharmony_ci}
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ciint btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
2568c2ecf20Sopenharmony_ci			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
2578c2ecf20Sopenharmony_ci			     int type)
2588c2ecf20Sopenharmony_ci{
2598c2ecf20Sopenharmony_ci	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
2608c2ecf20Sopenharmony_ci					  num_bytes, disk_num_bytes, type, 0,
2618c2ecf20Sopenharmony_ci					  BTRFS_COMPRESS_NONE);
2628c2ecf20Sopenharmony_ci}
2638c2ecf20Sopenharmony_ci
2648c2ecf20Sopenharmony_ciint btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
2658c2ecf20Sopenharmony_ci				 u64 disk_bytenr, u64 num_bytes,
2668c2ecf20Sopenharmony_ci				 u64 disk_num_bytes, int type)
2678c2ecf20Sopenharmony_ci{
2688c2ecf20Sopenharmony_ci	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
2698c2ecf20Sopenharmony_ci					  num_bytes, disk_num_bytes, type, 1,
2708c2ecf20Sopenharmony_ci					  BTRFS_COMPRESS_NONE);
2718c2ecf20Sopenharmony_ci}
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ciint btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
2748c2ecf20Sopenharmony_ci				      u64 disk_bytenr, u64 num_bytes,
2758c2ecf20Sopenharmony_ci				      u64 disk_num_bytes, int type,
2768c2ecf20Sopenharmony_ci				      int compress_type)
2778c2ecf20Sopenharmony_ci{
2788c2ecf20Sopenharmony_ci	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
2798c2ecf20Sopenharmony_ci					  num_bytes, disk_num_bytes, type, 0,
2808c2ecf20Sopenharmony_ci					  compress_type);
2818c2ecf20Sopenharmony_ci}
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_ci/*
2848c2ecf20Sopenharmony_ci * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
2858c2ecf20Sopenharmony_ci * when an ordered extent is finished.  If the list covers more than one
2868c2ecf20Sopenharmony_ci * ordered extent, it is split across multiples.
2878c2ecf20Sopenharmony_ci */
2888c2ecf20Sopenharmony_civoid btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
2898c2ecf20Sopenharmony_ci			   struct btrfs_ordered_sum *sum)
2908c2ecf20Sopenharmony_ci{
2918c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree;
2928c2ecf20Sopenharmony_ci
2938c2ecf20Sopenharmony_ci	tree = &BTRFS_I(entry->inode)->ordered_tree;
2948c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
2958c2ecf20Sopenharmony_ci	list_add_tail(&sum->list, &entry->list);
2968c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
2978c2ecf20Sopenharmony_ci}
2988c2ecf20Sopenharmony_ci
2998c2ecf20Sopenharmony_ci/*
3008c2ecf20Sopenharmony_ci * this is used to account for finished IO across a given range
3018c2ecf20Sopenharmony_ci * of the file.  The IO may span ordered extents.  If
3028c2ecf20Sopenharmony_ci * a given ordered_extent is completely done, 1 is returned, otherwise
3038c2ecf20Sopenharmony_ci * 0.
3048c2ecf20Sopenharmony_ci *
3058c2ecf20Sopenharmony_ci * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
3068c2ecf20Sopenharmony_ci * to make sure this function only returns 1 once for a given ordered extent.
3078c2ecf20Sopenharmony_ci *
3088c2ecf20Sopenharmony_ci * file_offset is updated to one byte past the range that is recorded as
3098c2ecf20Sopenharmony_ci * complete.  This allows you to walk forward in the file.
3108c2ecf20Sopenharmony_ci */
3118c2ecf20Sopenharmony_ciint btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
3128c2ecf20Sopenharmony_ci				   struct btrfs_ordered_extent **cached,
3138c2ecf20Sopenharmony_ci				   u64 *file_offset, u64 io_size, int uptodate)
3148c2ecf20Sopenharmony_ci{
3158c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3168c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
3178c2ecf20Sopenharmony_ci	struct rb_node *node;
3188c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry = NULL;
3198c2ecf20Sopenharmony_ci	int ret;
3208c2ecf20Sopenharmony_ci	unsigned long flags;
3218c2ecf20Sopenharmony_ci	u64 dec_end;
3228c2ecf20Sopenharmony_ci	u64 dec_start;
3238c2ecf20Sopenharmony_ci	u64 to_dec;
3248c2ecf20Sopenharmony_ci
3258c2ecf20Sopenharmony_ci	spin_lock_irqsave(&tree->lock, flags);
3268c2ecf20Sopenharmony_ci	node = tree_search(tree, *file_offset);
3278c2ecf20Sopenharmony_ci	if (!node) {
3288c2ecf20Sopenharmony_ci		ret = 1;
3298c2ecf20Sopenharmony_ci		goto out;
3308c2ecf20Sopenharmony_ci	}
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
3338c2ecf20Sopenharmony_ci	if (!offset_in_entry(entry, *file_offset)) {
3348c2ecf20Sopenharmony_ci		ret = 1;
3358c2ecf20Sopenharmony_ci		goto out;
3368c2ecf20Sopenharmony_ci	}
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	dec_start = max(*file_offset, entry->file_offset);
3398c2ecf20Sopenharmony_ci	dec_end = min(*file_offset + io_size,
3408c2ecf20Sopenharmony_ci		      entry->file_offset + entry->num_bytes);
3418c2ecf20Sopenharmony_ci	*file_offset = dec_end;
3428c2ecf20Sopenharmony_ci	if (dec_start > dec_end) {
3438c2ecf20Sopenharmony_ci		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
3448c2ecf20Sopenharmony_ci			   dec_start, dec_end);
3458c2ecf20Sopenharmony_ci	}
3468c2ecf20Sopenharmony_ci	to_dec = dec_end - dec_start;
3478c2ecf20Sopenharmony_ci	if (to_dec > entry->bytes_left) {
3488c2ecf20Sopenharmony_ci		btrfs_crit(fs_info,
3498c2ecf20Sopenharmony_ci			   "bad ordered accounting left %llu size %llu",
3508c2ecf20Sopenharmony_ci			   entry->bytes_left, to_dec);
3518c2ecf20Sopenharmony_ci	}
3528c2ecf20Sopenharmony_ci	entry->bytes_left -= to_dec;
3538c2ecf20Sopenharmony_ci	if (!uptodate)
3548c2ecf20Sopenharmony_ci		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	if (entry->bytes_left == 0) {
3578c2ecf20Sopenharmony_ci		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
3588c2ecf20Sopenharmony_ci		/* test_and_set_bit implies a barrier */
3598c2ecf20Sopenharmony_ci		cond_wake_up_nomb(&entry->wait);
3608c2ecf20Sopenharmony_ci	} else {
3618c2ecf20Sopenharmony_ci		ret = 1;
3628c2ecf20Sopenharmony_ci	}
3638c2ecf20Sopenharmony_ciout:
3648c2ecf20Sopenharmony_ci	if (!ret && cached && entry) {
3658c2ecf20Sopenharmony_ci		*cached = entry;
3668c2ecf20Sopenharmony_ci		refcount_inc(&entry->refs);
3678c2ecf20Sopenharmony_ci	}
3688c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&tree->lock, flags);
3698c2ecf20Sopenharmony_ci	return ret == 0;
3708c2ecf20Sopenharmony_ci}
3718c2ecf20Sopenharmony_ci
3728c2ecf20Sopenharmony_ci/*
3738c2ecf20Sopenharmony_ci * this is used to account for finished IO across a given range
3748c2ecf20Sopenharmony_ci * of the file.  The IO should not span ordered extents.  If
3758c2ecf20Sopenharmony_ci * a given ordered_extent is completely done, 1 is returned, otherwise
3768c2ecf20Sopenharmony_ci * 0.
3778c2ecf20Sopenharmony_ci *
3788c2ecf20Sopenharmony_ci * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
3798c2ecf20Sopenharmony_ci * to make sure this function only returns 1 once for a given ordered extent.
3808c2ecf20Sopenharmony_ci */
3818c2ecf20Sopenharmony_ciint btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
3828c2ecf20Sopenharmony_ci				   struct btrfs_ordered_extent **cached,
3838c2ecf20Sopenharmony_ci				   u64 file_offset, u64 io_size, int uptodate)
3848c2ecf20Sopenharmony_ci{
3858c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
3868c2ecf20Sopenharmony_ci	struct rb_node *node;
3878c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry = NULL;
3888c2ecf20Sopenharmony_ci	unsigned long flags;
3898c2ecf20Sopenharmony_ci	int ret;
3908c2ecf20Sopenharmony_ci
3918c2ecf20Sopenharmony_ci	spin_lock_irqsave(&tree->lock, flags);
3928c2ecf20Sopenharmony_ci	if (cached && *cached) {
3938c2ecf20Sopenharmony_ci		entry = *cached;
3948c2ecf20Sopenharmony_ci		goto have_entry;
3958c2ecf20Sopenharmony_ci	}
3968c2ecf20Sopenharmony_ci
3978c2ecf20Sopenharmony_ci	node = tree_search(tree, file_offset);
3988c2ecf20Sopenharmony_ci	if (!node) {
3998c2ecf20Sopenharmony_ci		ret = 1;
4008c2ecf20Sopenharmony_ci		goto out;
4018c2ecf20Sopenharmony_ci	}
4028c2ecf20Sopenharmony_ci
4038c2ecf20Sopenharmony_ci	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
4048c2ecf20Sopenharmony_cihave_entry:
4058c2ecf20Sopenharmony_ci	if (!offset_in_entry(entry, file_offset)) {
4068c2ecf20Sopenharmony_ci		ret = 1;
4078c2ecf20Sopenharmony_ci		goto out;
4088c2ecf20Sopenharmony_ci	}
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci	if (io_size > entry->bytes_left) {
4118c2ecf20Sopenharmony_ci		btrfs_crit(inode->root->fs_info,
4128c2ecf20Sopenharmony_ci			   "bad ordered accounting left %llu size %llu",
4138c2ecf20Sopenharmony_ci		       entry->bytes_left, io_size);
4148c2ecf20Sopenharmony_ci	}
4158c2ecf20Sopenharmony_ci	entry->bytes_left -= io_size;
4168c2ecf20Sopenharmony_ci	if (!uptodate)
4178c2ecf20Sopenharmony_ci		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
4188c2ecf20Sopenharmony_ci
4198c2ecf20Sopenharmony_ci	if (entry->bytes_left == 0) {
4208c2ecf20Sopenharmony_ci		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
4218c2ecf20Sopenharmony_ci		/* test_and_set_bit implies a barrier */
4228c2ecf20Sopenharmony_ci		cond_wake_up_nomb(&entry->wait);
4238c2ecf20Sopenharmony_ci	} else {
4248c2ecf20Sopenharmony_ci		ret = 1;
4258c2ecf20Sopenharmony_ci	}
4268c2ecf20Sopenharmony_ciout:
4278c2ecf20Sopenharmony_ci	if (!ret && cached && entry) {
4288c2ecf20Sopenharmony_ci		*cached = entry;
4298c2ecf20Sopenharmony_ci		refcount_inc(&entry->refs);
4308c2ecf20Sopenharmony_ci	}
4318c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&tree->lock, flags);
4328c2ecf20Sopenharmony_ci	return ret == 0;
4338c2ecf20Sopenharmony_ci}
4348c2ecf20Sopenharmony_ci
4358c2ecf20Sopenharmony_ci/*
4368c2ecf20Sopenharmony_ci * used to drop a reference on an ordered extent.  This will free
4378c2ecf20Sopenharmony_ci * the extent if the last reference is dropped
4388c2ecf20Sopenharmony_ci */
4398c2ecf20Sopenharmony_civoid btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
4408c2ecf20Sopenharmony_ci{
4418c2ecf20Sopenharmony_ci	struct list_head *cur;
4428c2ecf20Sopenharmony_ci	struct btrfs_ordered_sum *sum;
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci	if (refcount_dec_and_test(&entry->refs)) {
4478c2ecf20Sopenharmony_ci		ASSERT(list_empty(&entry->root_extent_list));
4488c2ecf20Sopenharmony_ci		ASSERT(list_empty(&entry->log_list));
4498c2ecf20Sopenharmony_ci		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
4508c2ecf20Sopenharmony_ci		if (entry->inode)
4518c2ecf20Sopenharmony_ci			btrfs_add_delayed_iput(entry->inode);
4528c2ecf20Sopenharmony_ci		while (!list_empty(&entry->list)) {
4538c2ecf20Sopenharmony_ci			cur = entry->list.next;
4548c2ecf20Sopenharmony_ci			sum = list_entry(cur, struct btrfs_ordered_sum, list);
4558c2ecf20Sopenharmony_ci			list_del(&sum->list);
4568c2ecf20Sopenharmony_ci			kvfree(sum);
4578c2ecf20Sopenharmony_ci		}
4588c2ecf20Sopenharmony_ci		kmem_cache_free(btrfs_ordered_extent_cache, entry);
4598c2ecf20Sopenharmony_ci	}
4608c2ecf20Sopenharmony_ci}
4618c2ecf20Sopenharmony_ci
4628c2ecf20Sopenharmony_ci/*
4638c2ecf20Sopenharmony_ci * remove an ordered extent from the tree.  No references are dropped
4648c2ecf20Sopenharmony_ci * and waiters are woken up.
4658c2ecf20Sopenharmony_ci */
4668c2ecf20Sopenharmony_civoid btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
4678c2ecf20Sopenharmony_ci				 struct btrfs_ordered_extent *entry)
4688c2ecf20Sopenharmony_ci{
4698c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree;
4708c2ecf20Sopenharmony_ci	struct btrfs_root *root = btrfs_inode->root;
4718c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = root->fs_info;
4728c2ecf20Sopenharmony_ci	struct rb_node *node;
4738c2ecf20Sopenharmony_ci	bool pending;
4748c2ecf20Sopenharmony_ci
4758c2ecf20Sopenharmony_ci	/* This is paired with btrfs_add_ordered_extent. */
4768c2ecf20Sopenharmony_ci	spin_lock(&btrfs_inode->lock);
4778c2ecf20Sopenharmony_ci	btrfs_mod_outstanding_extents(btrfs_inode, -1);
4788c2ecf20Sopenharmony_ci	spin_unlock(&btrfs_inode->lock);
4798c2ecf20Sopenharmony_ci	if (root != fs_info->tree_root)
4808c2ecf20Sopenharmony_ci		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
4818c2ecf20Sopenharmony_ci						false);
4828c2ecf20Sopenharmony_ci
4838c2ecf20Sopenharmony_ci	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
4848c2ecf20Sopenharmony_ci		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
4858c2ecf20Sopenharmony_ci					 fs_info->delalloc_batch);
4868c2ecf20Sopenharmony_ci
4878c2ecf20Sopenharmony_ci	tree = &btrfs_inode->ordered_tree;
4888c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
4898c2ecf20Sopenharmony_ci	node = &entry->rb_node;
4908c2ecf20Sopenharmony_ci	rb_erase(node, &tree->tree);
4918c2ecf20Sopenharmony_ci	RB_CLEAR_NODE(node);
4928c2ecf20Sopenharmony_ci	if (tree->last == node)
4938c2ecf20Sopenharmony_ci		tree->last = NULL;
4948c2ecf20Sopenharmony_ci	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
4958c2ecf20Sopenharmony_ci	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
4968c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
4978c2ecf20Sopenharmony_ci
4988c2ecf20Sopenharmony_ci	/*
4998c2ecf20Sopenharmony_ci	 * The current running transaction is waiting on us, we need to let it
5008c2ecf20Sopenharmony_ci	 * know that we're complete and wake it up.
5018c2ecf20Sopenharmony_ci	 */
5028c2ecf20Sopenharmony_ci	if (pending) {
5038c2ecf20Sopenharmony_ci		struct btrfs_transaction *trans;
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci		/*
5068c2ecf20Sopenharmony_ci		 * The checks for trans are just a formality, it should be set,
5078c2ecf20Sopenharmony_ci		 * but if it isn't we don't want to deref/assert under the spin
5088c2ecf20Sopenharmony_ci		 * lock, so be nice and check if trans is set, but ASSERT() so
5098c2ecf20Sopenharmony_ci		 * if it isn't set a developer will notice.
5108c2ecf20Sopenharmony_ci		 */
5118c2ecf20Sopenharmony_ci		spin_lock(&fs_info->trans_lock);
5128c2ecf20Sopenharmony_ci		trans = fs_info->running_transaction;
5138c2ecf20Sopenharmony_ci		if (trans)
5148c2ecf20Sopenharmony_ci			refcount_inc(&trans->use_count);
5158c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->trans_lock);
5168c2ecf20Sopenharmony_ci
5178c2ecf20Sopenharmony_ci		ASSERT(trans);
5188c2ecf20Sopenharmony_ci		if (trans) {
5198c2ecf20Sopenharmony_ci			if (atomic_dec_and_test(&trans->pending_ordered))
5208c2ecf20Sopenharmony_ci				wake_up(&trans->pending_wait);
5218c2ecf20Sopenharmony_ci			btrfs_put_transaction(trans);
5228c2ecf20Sopenharmony_ci		}
5238c2ecf20Sopenharmony_ci	}
5248c2ecf20Sopenharmony_ci
5258c2ecf20Sopenharmony_ci	spin_lock(&root->ordered_extent_lock);
5268c2ecf20Sopenharmony_ci	list_del_init(&entry->root_extent_list);
5278c2ecf20Sopenharmony_ci	root->nr_ordered_extents--;
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci	if (!root->nr_ordered_extents) {
5328c2ecf20Sopenharmony_ci		spin_lock(&fs_info->ordered_root_lock);
5338c2ecf20Sopenharmony_ci		BUG_ON(list_empty(&root->ordered_root));
5348c2ecf20Sopenharmony_ci		list_del_init(&root->ordered_root);
5358c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->ordered_root_lock);
5368c2ecf20Sopenharmony_ci	}
5378c2ecf20Sopenharmony_ci	spin_unlock(&root->ordered_extent_lock);
5388c2ecf20Sopenharmony_ci	wake_up(&entry->wait);
5398c2ecf20Sopenharmony_ci}
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_cistatic void btrfs_run_ordered_extent_work(struct btrfs_work *work)
5428c2ecf20Sopenharmony_ci{
5438c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *ordered;
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_ci	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
5468c2ecf20Sopenharmony_ci	btrfs_start_ordered_extent(ordered, 1);
5478c2ecf20Sopenharmony_ci	complete(&ordered->completion);
5488c2ecf20Sopenharmony_ci}
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_ci/*
5518c2ecf20Sopenharmony_ci * wait for all the ordered extents in a root.  This is done when balancing
5528c2ecf20Sopenharmony_ci * space between drives.
5538c2ecf20Sopenharmony_ci */
5548c2ecf20Sopenharmony_ciu64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
5558c2ecf20Sopenharmony_ci			       const u64 range_start, const u64 range_len)
5568c2ecf20Sopenharmony_ci{
5578c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = root->fs_info;
5588c2ecf20Sopenharmony_ci	LIST_HEAD(splice);
5598c2ecf20Sopenharmony_ci	LIST_HEAD(skipped);
5608c2ecf20Sopenharmony_ci	LIST_HEAD(works);
5618c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *ordered, *next;
5628c2ecf20Sopenharmony_ci	u64 count = 0;
5638c2ecf20Sopenharmony_ci	const u64 range_end = range_start + range_len;
5648c2ecf20Sopenharmony_ci
5658c2ecf20Sopenharmony_ci	mutex_lock(&root->ordered_extent_mutex);
5668c2ecf20Sopenharmony_ci	spin_lock(&root->ordered_extent_lock);
5678c2ecf20Sopenharmony_ci	list_splice_init(&root->ordered_extents, &splice);
5688c2ecf20Sopenharmony_ci	while (!list_empty(&splice) && nr) {
5698c2ecf20Sopenharmony_ci		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
5708c2ecf20Sopenharmony_ci					   root_extent_list);
5718c2ecf20Sopenharmony_ci
5728c2ecf20Sopenharmony_ci		if (range_end <= ordered->disk_bytenr ||
5738c2ecf20Sopenharmony_ci		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
5748c2ecf20Sopenharmony_ci			list_move_tail(&ordered->root_extent_list, &skipped);
5758c2ecf20Sopenharmony_ci			cond_resched_lock(&root->ordered_extent_lock);
5768c2ecf20Sopenharmony_ci			continue;
5778c2ecf20Sopenharmony_ci		}
5788c2ecf20Sopenharmony_ci
5798c2ecf20Sopenharmony_ci		list_move_tail(&ordered->root_extent_list,
5808c2ecf20Sopenharmony_ci			       &root->ordered_extents);
5818c2ecf20Sopenharmony_ci		refcount_inc(&ordered->refs);
5828c2ecf20Sopenharmony_ci		spin_unlock(&root->ordered_extent_lock);
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_ci		btrfs_init_work(&ordered->flush_work,
5858c2ecf20Sopenharmony_ci				btrfs_run_ordered_extent_work, NULL, NULL);
5868c2ecf20Sopenharmony_ci		list_add_tail(&ordered->work_list, &works);
5878c2ecf20Sopenharmony_ci		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
5888c2ecf20Sopenharmony_ci
5898c2ecf20Sopenharmony_ci		cond_resched();
5908c2ecf20Sopenharmony_ci		spin_lock(&root->ordered_extent_lock);
5918c2ecf20Sopenharmony_ci		if (nr != U64_MAX)
5928c2ecf20Sopenharmony_ci			nr--;
5938c2ecf20Sopenharmony_ci		count++;
5948c2ecf20Sopenharmony_ci	}
5958c2ecf20Sopenharmony_ci	list_splice_tail(&skipped, &root->ordered_extents);
5968c2ecf20Sopenharmony_ci	list_splice_tail(&splice, &root->ordered_extents);
5978c2ecf20Sopenharmony_ci	spin_unlock(&root->ordered_extent_lock);
5988c2ecf20Sopenharmony_ci
5998c2ecf20Sopenharmony_ci	list_for_each_entry_safe(ordered, next, &works, work_list) {
6008c2ecf20Sopenharmony_ci		list_del_init(&ordered->work_list);
6018c2ecf20Sopenharmony_ci		wait_for_completion(&ordered->completion);
6028c2ecf20Sopenharmony_ci		btrfs_put_ordered_extent(ordered);
6038c2ecf20Sopenharmony_ci		cond_resched();
6048c2ecf20Sopenharmony_ci	}
6058c2ecf20Sopenharmony_ci	mutex_unlock(&root->ordered_extent_mutex);
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci	return count;
6088c2ecf20Sopenharmony_ci}
6098c2ecf20Sopenharmony_ci
6108c2ecf20Sopenharmony_civoid btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
6118c2ecf20Sopenharmony_ci			     const u64 range_start, const u64 range_len)
6128c2ecf20Sopenharmony_ci{
6138c2ecf20Sopenharmony_ci	struct btrfs_root *root;
6148c2ecf20Sopenharmony_ci	struct list_head splice;
6158c2ecf20Sopenharmony_ci	u64 done;
6168c2ecf20Sopenharmony_ci
6178c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&splice);
6188c2ecf20Sopenharmony_ci
6198c2ecf20Sopenharmony_ci	mutex_lock(&fs_info->ordered_operations_mutex);
6208c2ecf20Sopenharmony_ci	spin_lock(&fs_info->ordered_root_lock);
6218c2ecf20Sopenharmony_ci	list_splice_init(&fs_info->ordered_roots, &splice);
6228c2ecf20Sopenharmony_ci	while (!list_empty(&splice) && nr) {
6238c2ecf20Sopenharmony_ci		root = list_first_entry(&splice, struct btrfs_root,
6248c2ecf20Sopenharmony_ci					ordered_root);
6258c2ecf20Sopenharmony_ci		root = btrfs_grab_root(root);
6268c2ecf20Sopenharmony_ci		BUG_ON(!root);
6278c2ecf20Sopenharmony_ci		list_move_tail(&root->ordered_root,
6288c2ecf20Sopenharmony_ci			       &fs_info->ordered_roots);
6298c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->ordered_root_lock);
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci		done = btrfs_wait_ordered_extents(root, nr,
6328c2ecf20Sopenharmony_ci						  range_start, range_len);
6338c2ecf20Sopenharmony_ci		btrfs_put_root(root);
6348c2ecf20Sopenharmony_ci
6358c2ecf20Sopenharmony_ci		spin_lock(&fs_info->ordered_root_lock);
6368c2ecf20Sopenharmony_ci		if (nr != U64_MAX) {
6378c2ecf20Sopenharmony_ci			nr -= done;
6388c2ecf20Sopenharmony_ci		}
6398c2ecf20Sopenharmony_ci	}
6408c2ecf20Sopenharmony_ci	list_splice_tail(&splice, &fs_info->ordered_roots);
6418c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->ordered_root_lock);
6428c2ecf20Sopenharmony_ci	mutex_unlock(&fs_info->ordered_operations_mutex);
6438c2ecf20Sopenharmony_ci}
6448c2ecf20Sopenharmony_ci
6458c2ecf20Sopenharmony_ci/*
6468c2ecf20Sopenharmony_ci * Used to start IO or wait for a given ordered extent to finish.
6478c2ecf20Sopenharmony_ci *
6488c2ecf20Sopenharmony_ci * If wait is one, this effectively waits on page writeback for all the pages
6498c2ecf20Sopenharmony_ci * in the extent, and it waits on the io completion code to insert
6508c2ecf20Sopenharmony_ci * metadata into the btree corresponding to the extent
6518c2ecf20Sopenharmony_ci */
6528c2ecf20Sopenharmony_civoid btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
6538c2ecf20Sopenharmony_ci{
6548c2ecf20Sopenharmony_ci	u64 start = entry->file_offset;
6558c2ecf20Sopenharmony_ci	u64 end = start + entry->num_bytes - 1;
6568c2ecf20Sopenharmony_ci	struct btrfs_inode *inode = BTRFS_I(entry->inode);
6578c2ecf20Sopenharmony_ci
6588c2ecf20Sopenharmony_ci	trace_btrfs_ordered_extent_start(inode, entry);
6598c2ecf20Sopenharmony_ci
6608c2ecf20Sopenharmony_ci	/*
6618c2ecf20Sopenharmony_ci	 * pages in the range can be dirty, clean or writeback.  We
6628c2ecf20Sopenharmony_ci	 * start IO on any dirty ones so the wait doesn't stall waiting
6638c2ecf20Sopenharmony_ci	 * for the flusher thread to find them
6648c2ecf20Sopenharmony_ci	 */
6658c2ecf20Sopenharmony_ci	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
6668c2ecf20Sopenharmony_ci		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
6678c2ecf20Sopenharmony_ci	if (wait) {
6688c2ecf20Sopenharmony_ci		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
6698c2ecf20Sopenharmony_ci						 &entry->flags));
6708c2ecf20Sopenharmony_ci	}
6718c2ecf20Sopenharmony_ci}
6728c2ecf20Sopenharmony_ci
6738c2ecf20Sopenharmony_ci/*
6748c2ecf20Sopenharmony_ci * Used to wait on ordered extents across a large range of bytes.
6758c2ecf20Sopenharmony_ci */
6768c2ecf20Sopenharmony_ciint btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
6778c2ecf20Sopenharmony_ci{
6788c2ecf20Sopenharmony_ci	int ret = 0;
6798c2ecf20Sopenharmony_ci	int ret_wb = 0;
6808c2ecf20Sopenharmony_ci	u64 end;
6818c2ecf20Sopenharmony_ci	u64 orig_end;
6828c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *ordered;
6838c2ecf20Sopenharmony_ci
6848c2ecf20Sopenharmony_ci	if (start + len < start) {
6858c2ecf20Sopenharmony_ci		orig_end = INT_LIMIT(loff_t);
6868c2ecf20Sopenharmony_ci	} else {
6878c2ecf20Sopenharmony_ci		orig_end = start + len - 1;
6888c2ecf20Sopenharmony_ci		if (orig_end > INT_LIMIT(loff_t))
6898c2ecf20Sopenharmony_ci			orig_end = INT_LIMIT(loff_t);
6908c2ecf20Sopenharmony_ci	}
6918c2ecf20Sopenharmony_ci
6928c2ecf20Sopenharmony_ci	/* start IO across the range first to instantiate any delalloc
6938c2ecf20Sopenharmony_ci	 * extents
6948c2ecf20Sopenharmony_ci	 */
6958c2ecf20Sopenharmony_ci	ret = btrfs_fdatawrite_range(inode, start, orig_end);
6968c2ecf20Sopenharmony_ci	if (ret)
6978c2ecf20Sopenharmony_ci		return ret;
6988c2ecf20Sopenharmony_ci
6998c2ecf20Sopenharmony_ci	/*
7008c2ecf20Sopenharmony_ci	 * If we have a writeback error don't return immediately. Wait first
7018c2ecf20Sopenharmony_ci	 * for any ordered extents that haven't completed yet. This is to make
7028c2ecf20Sopenharmony_ci	 * sure no one can dirty the same page ranges and call writepages()
7038c2ecf20Sopenharmony_ci	 * before the ordered extents complete - to avoid failures (-EEXIST)
7048c2ecf20Sopenharmony_ci	 * when adding the new ordered extents to the ordered tree.
7058c2ecf20Sopenharmony_ci	 */
7068c2ecf20Sopenharmony_ci	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
7078c2ecf20Sopenharmony_ci
7088c2ecf20Sopenharmony_ci	end = orig_end;
7098c2ecf20Sopenharmony_ci	while (1) {
7108c2ecf20Sopenharmony_ci		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
7118c2ecf20Sopenharmony_ci		if (!ordered)
7128c2ecf20Sopenharmony_ci			break;
7138c2ecf20Sopenharmony_ci		if (ordered->file_offset > orig_end) {
7148c2ecf20Sopenharmony_ci			btrfs_put_ordered_extent(ordered);
7158c2ecf20Sopenharmony_ci			break;
7168c2ecf20Sopenharmony_ci		}
7178c2ecf20Sopenharmony_ci		if (ordered->file_offset + ordered->num_bytes <= start) {
7188c2ecf20Sopenharmony_ci			btrfs_put_ordered_extent(ordered);
7198c2ecf20Sopenharmony_ci			break;
7208c2ecf20Sopenharmony_ci		}
7218c2ecf20Sopenharmony_ci		btrfs_start_ordered_extent(ordered, 1);
7228c2ecf20Sopenharmony_ci		end = ordered->file_offset;
7238c2ecf20Sopenharmony_ci		/*
7248c2ecf20Sopenharmony_ci		 * If the ordered extent had an error save the error but don't
7258c2ecf20Sopenharmony_ci		 * exit without waiting first for all other ordered extents in
7268c2ecf20Sopenharmony_ci		 * the range to complete.
7278c2ecf20Sopenharmony_ci		 */
7288c2ecf20Sopenharmony_ci		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
7298c2ecf20Sopenharmony_ci			ret = -EIO;
7308c2ecf20Sopenharmony_ci		btrfs_put_ordered_extent(ordered);
7318c2ecf20Sopenharmony_ci		if (end == 0 || end == start)
7328c2ecf20Sopenharmony_ci			break;
7338c2ecf20Sopenharmony_ci		end--;
7348c2ecf20Sopenharmony_ci	}
7358c2ecf20Sopenharmony_ci	return ret_wb ? ret_wb : ret;
7368c2ecf20Sopenharmony_ci}
7378c2ecf20Sopenharmony_ci
7388c2ecf20Sopenharmony_ci/*
7398c2ecf20Sopenharmony_ci * find an ordered extent corresponding to file_offset.  return NULL if
7408c2ecf20Sopenharmony_ci * nothing is found, otherwise take a reference on the extent and return it
7418c2ecf20Sopenharmony_ci */
7428c2ecf20Sopenharmony_cistruct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
7438c2ecf20Sopenharmony_ci							 u64 file_offset)
7448c2ecf20Sopenharmony_ci{
7458c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree;
7468c2ecf20Sopenharmony_ci	struct rb_node *node;
7478c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry = NULL;
7488c2ecf20Sopenharmony_ci
7498c2ecf20Sopenharmony_ci	tree = &inode->ordered_tree;
7508c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
7518c2ecf20Sopenharmony_ci	node = tree_search(tree, file_offset);
7528c2ecf20Sopenharmony_ci	if (!node)
7538c2ecf20Sopenharmony_ci		goto out;
7548c2ecf20Sopenharmony_ci
7558c2ecf20Sopenharmony_ci	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
7568c2ecf20Sopenharmony_ci	if (!offset_in_entry(entry, file_offset))
7578c2ecf20Sopenharmony_ci		entry = NULL;
7588c2ecf20Sopenharmony_ci	if (entry)
7598c2ecf20Sopenharmony_ci		refcount_inc(&entry->refs);
7608c2ecf20Sopenharmony_ciout:
7618c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
7628c2ecf20Sopenharmony_ci	return entry;
7638c2ecf20Sopenharmony_ci}
7648c2ecf20Sopenharmony_ci
7658c2ecf20Sopenharmony_ci/* Since the DIO code tries to lock a wide area we need to look for any ordered
7668c2ecf20Sopenharmony_ci * extents that exist in the range, rather than just the start of the range.
7678c2ecf20Sopenharmony_ci */
7688c2ecf20Sopenharmony_cistruct btrfs_ordered_extent *btrfs_lookup_ordered_range(
7698c2ecf20Sopenharmony_ci		struct btrfs_inode *inode, u64 file_offset, u64 len)
7708c2ecf20Sopenharmony_ci{
7718c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree;
7728c2ecf20Sopenharmony_ci	struct rb_node *node;
7738c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry = NULL;
7748c2ecf20Sopenharmony_ci
7758c2ecf20Sopenharmony_ci	tree = &inode->ordered_tree;
7768c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
7778c2ecf20Sopenharmony_ci	node = tree_search(tree, file_offset);
7788c2ecf20Sopenharmony_ci	if (!node) {
7798c2ecf20Sopenharmony_ci		node = tree_search(tree, file_offset + len);
7808c2ecf20Sopenharmony_ci		if (!node)
7818c2ecf20Sopenharmony_ci			goto out;
7828c2ecf20Sopenharmony_ci	}
7838c2ecf20Sopenharmony_ci
7848c2ecf20Sopenharmony_ci	while (1) {
7858c2ecf20Sopenharmony_ci		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
7868c2ecf20Sopenharmony_ci		if (range_overlaps(entry, file_offset, len))
7878c2ecf20Sopenharmony_ci			break;
7888c2ecf20Sopenharmony_ci
7898c2ecf20Sopenharmony_ci		if (entry->file_offset >= file_offset + len) {
7908c2ecf20Sopenharmony_ci			entry = NULL;
7918c2ecf20Sopenharmony_ci			break;
7928c2ecf20Sopenharmony_ci		}
7938c2ecf20Sopenharmony_ci		entry = NULL;
7948c2ecf20Sopenharmony_ci		node = rb_next(node);
7958c2ecf20Sopenharmony_ci		if (!node)
7968c2ecf20Sopenharmony_ci			break;
7978c2ecf20Sopenharmony_ci	}
7988c2ecf20Sopenharmony_ciout:
7998c2ecf20Sopenharmony_ci	if (entry)
8008c2ecf20Sopenharmony_ci		refcount_inc(&entry->refs);
8018c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
8028c2ecf20Sopenharmony_ci	return entry;
8038c2ecf20Sopenharmony_ci}
8048c2ecf20Sopenharmony_ci
8058c2ecf20Sopenharmony_ci/*
8068c2ecf20Sopenharmony_ci * Adds all ordered extents to the given list. The list ends up sorted by the
8078c2ecf20Sopenharmony_ci * file_offset of the ordered extents.
8088c2ecf20Sopenharmony_ci */
8098c2ecf20Sopenharmony_civoid btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
8108c2ecf20Sopenharmony_ci					   struct list_head *list)
8118c2ecf20Sopenharmony_ci{
8128c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
8138c2ecf20Sopenharmony_ci	struct rb_node *n;
8148c2ecf20Sopenharmony_ci
8158c2ecf20Sopenharmony_ci	ASSERT(inode_is_locked(&inode->vfs_inode));
8168c2ecf20Sopenharmony_ci
8178c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
8188c2ecf20Sopenharmony_ci	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
8198c2ecf20Sopenharmony_ci		struct btrfs_ordered_extent *ordered;
8208c2ecf20Sopenharmony_ci
8218c2ecf20Sopenharmony_ci		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
8228c2ecf20Sopenharmony_ci
8238c2ecf20Sopenharmony_ci		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
8248c2ecf20Sopenharmony_ci			continue;
8258c2ecf20Sopenharmony_ci
8268c2ecf20Sopenharmony_ci		ASSERT(list_empty(&ordered->log_list));
8278c2ecf20Sopenharmony_ci		list_add_tail(&ordered->log_list, list);
8288c2ecf20Sopenharmony_ci		refcount_inc(&ordered->refs);
8298c2ecf20Sopenharmony_ci	}
8308c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
8318c2ecf20Sopenharmony_ci}
8328c2ecf20Sopenharmony_ci
8338c2ecf20Sopenharmony_ci/*
8348c2ecf20Sopenharmony_ci * lookup and return any extent before 'file_offset'.  NULL is returned
8358c2ecf20Sopenharmony_ci * if none is found
8368c2ecf20Sopenharmony_ci */
8378c2ecf20Sopenharmony_cistruct btrfs_ordered_extent *
8388c2ecf20Sopenharmony_cibtrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
8398c2ecf20Sopenharmony_ci{
8408c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree;
8418c2ecf20Sopenharmony_ci	struct rb_node *node;
8428c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *entry = NULL;
8438c2ecf20Sopenharmony_ci
8448c2ecf20Sopenharmony_ci	tree = &inode->ordered_tree;
8458c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
8468c2ecf20Sopenharmony_ci	node = tree_search(tree, file_offset);
8478c2ecf20Sopenharmony_ci	if (!node)
8488c2ecf20Sopenharmony_ci		goto out;
8498c2ecf20Sopenharmony_ci
8508c2ecf20Sopenharmony_ci	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
8518c2ecf20Sopenharmony_ci	refcount_inc(&entry->refs);
8528c2ecf20Sopenharmony_ciout:
8538c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
8548c2ecf20Sopenharmony_ci	return entry;
8558c2ecf20Sopenharmony_ci}
8568c2ecf20Sopenharmony_ci
8578c2ecf20Sopenharmony_ci/*
8588c2ecf20Sopenharmony_ci * search the ordered extents for one corresponding to 'offset' and
8598c2ecf20Sopenharmony_ci * try to find a checksum.  This is used because we allow pages to
8608c2ecf20Sopenharmony_ci * be reclaimed before their checksum is actually put into the btree
8618c2ecf20Sopenharmony_ci */
8628c2ecf20Sopenharmony_ciint btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
8638c2ecf20Sopenharmony_ci			   u64 disk_bytenr, u8 *sum, int len)
8648c2ecf20Sopenharmony_ci{
8658c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = inode->root->fs_info;
8668c2ecf20Sopenharmony_ci	struct btrfs_ordered_sum *ordered_sum;
8678c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *ordered;
8688c2ecf20Sopenharmony_ci	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
8698c2ecf20Sopenharmony_ci	unsigned long num_sectors;
8708c2ecf20Sopenharmony_ci	unsigned long i;
8718c2ecf20Sopenharmony_ci	u32 sectorsize = btrfs_inode_sectorsize(inode);
8728c2ecf20Sopenharmony_ci	const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
8738c2ecf20Sopenharmony_ci	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
8748c2ecf20Sopenharmony_ci	int index = 0;
8758c2ecf20Sopenharmony_ci
8768c2ecf20Sopenharmony_ci	ordered = btrfs_lookup_ordered_extent(inode, offset);
8778c2ecf20Sopenharmony_ci	if (!ordered)
8788c2ecf20Sopenharmony_ci		return 0;
8798c2ecf20Sopenharmony_ci
8808c2ecf20Sopenharmony_ci	spin_lock_irq(&tree->lock);
8818c2ecf20Sopenharmony_ci	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
8828c2ecf20Sopenharmony_ci		if (disk_bytenr >= ordered_sum->bytenr &&
8838c2ecf20Sopenharmony_ci		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
8848c2ecf20Sopenharmony_ci			i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
8858c2ecf20Sopenharmony_ci			num_sectors = ordered_sum->len >> blocksize_bits;
8868c2ecf20Sopenharmony_ci			num_sectors = min_t(int, len - index, num_sectors - i);
8878c2ecf20Sopenharmony_ci			memcpy(sum + index, ordered_sum->sums + i * csum_size,
8888c2ecf20Sopenharmony_ci			       num_sectors * csum_size);
8898c2ecf20Sopenharmony_ci
8908c2ecf20Sopenharmony_ci			index += (int)num_sectors * csum_size;
8918c2ecf20Sopenharmony_ci			if (index == len)
8928c2ecf20Sopenharmony_ci				goto out;
8938c2ecf20Sopenharmony_ci			disk_bytenr += num_sectors * sectorsize;
8948c2ecf20Sopenharmony_ci		}
8958c2ecf20Sopenharmony_ci	}
8968c2ecf20Sopenharmony_ciout:
8978c2ecf20Sopenharmony_ci	spin_unlock_irq(&tree->lock);
8988c2ecf20Sopenharmony_ci	btrfs_put_ordered_extent(ordered);
8998c2ecf20Sopenharmony_ci	return index;
9008c2ecf20Sopenharmony_ci}
9018c2ecf20Sopenharmony_ci
9028c2ecf20Sopenharmony_ci/*
9038c2ecf20Sopenharmony_ci * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
9048c2ecf20Sopenharmony_ci * ordered extents in it are run to completion.
9058c2ecf20Sopenharmony_ci *
9068c2ecf20Sopenharmony_ci * @inode:        Inode whose ordered tree is to be searched
9078c2ecf20Sopenharmony_ci * @start:        Beginning of range to flush
9088c2ecf20Sopenharmony_ci * @end:          Last byte of range to lock
9098c2ecf20Sopenharmony_ci * @cached_state: If passed, will return the extent state responsible for the
9108c2ecf20Sopenharmony_ci * locked range. It's the caller's responsibility to free the cached state.
9118c2ecf20Sopenharmony_ci *
9128c2ecf20Sopenharmony_ci * This function always returns with the given range locked, ensuring after it's
9138c2ecf20Sopenharmony_ci * called no order extent can be pending.
9148c2ecf20Sopenharmony_ci */
9158c2ecf20Sopenharmony_civoid btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
9168c2ecf20Sopenharmony_ci					u64 end,
9178c2ecf20Sopenharmony_ci					struct extent_state **cached_state)
9188c2ecf20Sopenharmony_ci{
9198c2ecf20Sopenharmony_ci	struct btrfs_ordered_extent *ordered;
9208c2ecf20Sopenharmony_ci	struct extent_state *cache = NULL;
9218c2ecf20Sopenharmony_ci	struct extent_state **cachedp = &cache;
9228c2ecf20Sopenharmony_ci
9238c2ecf20Sopenharmony_ci	if (cached_state)
9248c2ecf20Sopenharmony_ci		cachedp = cached_state;
9258c2ecf20Sopenharmony_ci
9268c2ecf20Sopenharmony_ci	while (1) {
9278c2ecf20Sopenharmony_ci		lock_extent_bits(&inode->io_tree, start, end, cachedp);
9288c2ecf20Sopenharmony_ci		ordered = btrfs_lookup_ordered_range(inode, start,
9298c2ecf20Sopenharmony_ci						     end - start + 1);
9308c2ecf20Sopenharmony_ci		if (!ordered) {
9318c2ecf20Sopenharmony_ci			/*
9328c2ecf20Sopenharmony_ci			 * If no external cached_state has been passed then
9338c2ecf20Sopenharmony_ci			 * decrement the extra ref taken for cachedp since we
9348c2ecf20Sopenharmony_ci			 * aren't exposing it outside of this function
9358c2ecf20Sopenharmony_ci			 */
9368c2ecf20Sopenharmony_ci			if (!cached_state)
9378c2ecf20Sopenharmony_ci				refcount_dec(&cache->refs);
9388c2ecf20Sopenharmony_ci			break;
9398c2ecf20Sopenharmony_ci		}
9408c2ecf20Sopenharmony_ci		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
9418c2ecf20Sopenharmony_ci		btrfs_start_ordered_extent(ordered, 1);
9428c2ecf20Sopenharmony_ci		btrfs_put_ordered_extent(ordered);
9438c2ecf20Sopenharmony_ci	}
9448c2ecf20Sopenharmony_ci}
9458c2ecf20Sopenharmony_ci
9468c2ecf20Sopenharmony_ciint __init ordered_data_init(void)
9478c2ecf20Sopenharmony_ci{
9488c2ecf20Sopenharmony_ci	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
9498c2ecf20Sopenharmony_ci				     sizeof(struct btrfs_ordered_extent), 0,
9508c2ecf20Sopenharmony_ci				     SLAB_MEM_SPREAD,
9518c2ecf20Sopenharmony_ci				     NULL);
9528c2ecf20Sopenharmony_ci	if (!btrfs_ordered_extent_cache)
9538c2ecf20Sopenharmony_ci		return -ENOMEM;
9548c2ecf20Sopenharmony_ci
9558c2ecf20Sopenharmony_ci	return 0;
9568c2ecf20Sopenharmony_ci}
9578c2ecf20Sopenharmony_ci
9588c2ecf20Sopenharmony_civoid __cold ordered_data_exit(void)
9598c2ecf20Sopenharmony_ci{
9608c2ecf20Sopenharmony_ci	kmem_cache_destroy(btrfs_ordered_extent_cache);
9618c2ecf20Sopenharmony_ci}
962