18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Copyright (C) 2011 STRATO.  All rights reserved.
48c2ecf20Sopenharmony_ci */
58c2ecf20Sopenharmony_ci
68c2ecf20Sopenharmony_ci#include <linux/sched.h>
78c2ecf20Sopenharmony_ci#include <linux/pagemap.h>
88c2ecf20Sopenharmony_ci#include <linux/writeback.h>
98c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
108c2ecf20Sopenharmony_ci#include <linux/slab.h>
118c2ecf20Sopenharmony_ci#include <linux/workqueue.h>
128c2ecf20Sopenharmony_ci#include "ctree.h"
138c2ecf20Sopenharmony_ci#include "volumes.h"
148c2ecf20Sopenharmony_ci#include "disk-io.h"
158c2ecf20Sopenharmony_ci#include "transaction.h"
168c2ecf20Sopenharmony_ci#include "dev-replace.h"
178c2ecf20Sopenharmony_ci#include "block-group.h"
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci#undef DEBUG
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci/*
228c2ecf20Sopenharmony_ci * This is the implementation for the generic read ahead framework.
238c2ecf20Sopenharmony_ci *
248c2ecf20Sopenharmony_ci * To trigger a readahead, btrfs_reada_add must be called. It will start
258c2ecf20Sopenharmony_ci * a read ahead for the given range [start, end) on tree root. The returned
268c2ecf20Sopenharmony_ci * handle can either be used to wait on the readahead to finish
278c2ecf20Sopenharmony_ci * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
288c2ecf20Sopenharmony_ci *
298c2ecf20Sopenharmony_ci * The read ahead works as follows:
308c2ecf20Sopenharmony_ci * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
318c2ecf20Sopenharmony_ci * reada_start_machine will then search for extents to prefetch and trigger
328c2ecf20Sopenharmony_ci * some reads. When a read finishes for a node, all contained node/leaf
338c2ecf20Sopenharmony_ci * pointers that lie in the given range will also be enqueued. The reads will
348c2ecf20Sopenharmony_ci * be triggered in sequential order, thus giving a big win over a naive
 * enumeration. It will also make use of multi-device layouts. Each disk
 * will have its own read pointer and all disks will be utilized in parallel.
 * Also, no two disks will read both sides of a mirror simultaneously, as this
 * would waste seeking capacity. Instead both disks will read different parts
 * of the filesystem.
408c2ecf20Sopenharmony_ci * Any number of readaheads can be started in parallel. The read order will be
418c2ecf20Sopenharmony_ci * determined globally, i.e. 2 parallel readaheads will normally finish faster
428c2ecf20Sopenharmony_ci * than the 2 started one after another.
438c2ecf20Sopenharmony_ci */
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_ci#define MAX_IN_FLIGHT 6
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_cistruct reada_extctl {
488c2ecf20Sopenharmony_ci	struct list_head	list;
498c2ecf20Sopenharmony_ci	struct reada_control	*rc;
508c2ecf20Sopenharmony_ci	u64			generation;
518c2ecf20Sopenharmony_ci};
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_cistruct reada_extent {
548c2ecf20Sopenharmony_ci	u64			logical;
558c2ecf20Sopenharmony_ci	struct btrfs_key	top;
568c2ecf20Sopenharmony_ci	struct list_head	extctl;
578c2ecf20Sopenharmony_ci	int 			refcnt;
588c2ecf20Sopenharmony_ci	spinlock_t		lock;
598c2ecf20Sopenharmony_ci	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
608c2ecf20Sopenharmony_ci	int			nzones;
618c2ecf20Sopenharmony_ci	int			scheduled;
628c2ecf20Sopenharmony_ci};
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_cistruct reada_zone {
658c2ecf20Sopenharmony_ci	u64			start;
668c2ecf20Sopenharmony_ci	u64			end;
678c2ecf20Sopenharmony_ci	u64			elems;
688c2ecf20Sopenharmony_ci	struct list_head	list;
698c2ecf20Sopenharmony_ci	spinlock_t		lock;
708c2ecf20Sopenharmony_ci	int			locked;
718c2ecf20Sopenharmony_ci	struct btrfs_device	*device;
728c2ecf20Sopenharmony_ci	struct btrfs_device	*devs[BTRFS_MAX_MIRRORS]; /* full list, incl
738c2ecf20Sopenharmony_ci							   * self */
748c2ecf20Sopenharmony_ci	int			ndevs;
758c2ecf20Sopenharmony_ci	struct kref		refcnt;
768c2ecf20Sopenharmony_ci};
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_cistruct reada_machine_work {
798c2ecf20Sopenharmony_ci	struct btrfs_work	work;
808c2ecf20Sopenharmony_ci	struct btrfs_fs_info	*fs_info;
818c2ecf20Sopenharmony_ci};
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_cistatic void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
848c2ecf20Sopenharmony_cistatic void reada_control_release(struct kref *kref);
858c2ecf20Sopenharmony_cistatic void reada_zone_release(struct kref *kref);
868c2ecf20Sopenharmony_cistatic void reada_start_machine(struct btrfs_fs_info *fs_info);
878c2ecf20Sopenharmony_cistatic void __reada_start_machine(struct btrfs_fs_info *fs_info);
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_cistatic int reada_add_block(struct reada_control *rc, u64 logical,
908c2ecf20Sopenharmony_ci			   struct btrfs_key *top, u64 generation);
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci/* recurses */
938c2ecf20Sopenharmony_ci/* in case of err, eb might be NULL */
948c2ecf20Sopenharmony_cistatic void __readahead_hook(struct btrfs_fs_info *fs_info,
958c2ecf20Sopenharmony_ci			     struct reada_extent *re, struct extent_buffer *eb,
968c2ecf20Sopenharmony_ci			     int err)
978c2ecf20Sopenharmony_ci{
988c2ecf20Sopenharmony_ci	int nritems;
998c2ecf20Sopenharmony_ci	int i;
1008c2ecf20Sopenharmony_ci	u64 bytenr;
1018c2ecf20Sopenharmony_ci	u64 generation;
1028c2ecf20Sopenharmony_ci	struct list_head list;
1038c2ecf20Sopenharmony_ci
1048c2ecf20Sopenharmony_ci	spin_lock(&re->lock);
1058c2ecf20Sopenharmony_ci	/*
1068c2ecf20Sopenharmony_ci	 * just take the full list from the extent. afterwards we
1078c2ecf20Sopenharmony_ci	 * don't need the lock anymore
1088c2ecf20Sopenharmony_ci	 */
1098c2ecf20Sopenharmony_ci	list_replace_init(&re->extctl, &list);
1108c2ecf20Sopenharmony_ci	re->scheduled = 0;
1118c2ecf20Sopenharmony_ci	spin_unlock(&re->lock);
1128c2ecf20Sopenharmony_ci
1138c2ecf20Sopenharmony_ci	/*
1148c2ecf20Sopenharmony_ci	 * this is the error case, the extent buffer has not been
1158c2ecf20Sopenharmony_ci	 * read correctly. We won't access anything from it and
1168c2ecf20Sopenharmony_ci	 * just cleanup our data structures. Effectively this will
1178c2ecf20Sopenharmony_ci	 * cut the branch below this node from read ahead.
1188c2ecf20Sopenharmony_ci	 */
1198c2ecf20Sopenharmony_ci	if (err)
1208c2ecf20Sopenharmony_ci		goto cleanup;
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	/*
1238c2ecf20Sopenharmony_ci	 * FIXME: currently we just set nritems to 0 if this is a leaf,
1248c2ecf20Sopenharmony_ci	 * effectively ignoring the content. In a next step we could
1258c2ecf20Sopenharmony_ci	 * trigger more readahead depending from the content, e.g.
1268c2ecf20Sopenharmony_ci	 * fetch the checksums for the extents in the leaf.
1278c2ecf20Sopenharmony_ci	 */
1288c2ecf20Sopenharmony_ci	if (!btrfs_header_level(eb))
1298c2ecf20Sopenharmony_ci		goto cleanup;
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_ci	nritems = btrfs_header_nritems(eb);
1328c2ecf20Sopenharmony_ci	generation = btrfs_header_generation(eb);
1338c2ecf20Sopenharmony_ci	for (i = 0; i < nritems; i++) {
1348c2ecf20Sopenharmony_ci		struct reada_extctl *rec;
1358c2ecf20Sopenharmony_ci		u64 n_gen;
1368c2ecf20Sopenharmony_ci		struct btrfs_key key;
1378c2ecf20Sopenharmony_ci		struct btrfs_key next_key;
1388c2ecf20Sopenharmony_ci
1398c2ecf20Sopenharmony_ci		btrfs_node_key_to_cpu(eb, &key, i);
1408c2ecf20Sopenharmony_ci		if (i + 1 < nritems)
1418c2ecf20Sopenharmony_ci			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
1428c2ecf20Sopenharmony_ci		else
1438c2ecf20Sopenharmony_ci			next_key = re->top;
1448c2ecf20Sopenharmony_ci		bytenr = btrfs_node_blockptr(eb, i);
1458c2ecf20Sopenharmony_ci		n_gen = btrfs_node_ptr_generation(eb, i);
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci		list_for_each_entry(rec, &list, list) {
1488c2ecf20Sopenharmony_ci			struct reada_control *rc = rec->rc;
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci			/*
1518c2ecf20Sopenharmony_ci			 * if the generation doesn't match, just ignore this
1528c2ecf20Sopenharmony_ci			 * extctl. This will probably cut off a branch from
1538c2ecf20Sopenharmony_ci			 * prefetch. Alternatively one could start a new (sub-)
1548c2ecf20Sopenharmony_ci			 * prefetch for this branch, starting again from root.
1558c2ecf20Sopenharmony_ci			 * FIXME: move the generation check out of this loop
1568c2ecf20Sopenharmony_ci			 */
1578c2ecf20Sopenharmony_ci#ifdef DEBUG
1588c2ecf20Sopenharmony_ci			if (rec->generation != generation) {
1598c2ecf20Sopenharmony_ci				btrfs_debug(fs_info,
1608c2ecf20Sopenharmony_ci					    "generation mismatch for (%llu,%d,%llu) %llu != %llu",
1618c2ecf20Sopenharmony_ci					    key.objectid, key.type, key.offset,
1628c2ecf20Sopenharmony_ci					    rec->generation, generation);
1638c2ecf20Sopenharmony_ci			}
1648c2ecf20Sopenharmony_ci#endif
1658c2ecf20Sopenharmony_ci			if (rec->generation == generation &&
1668c2ecf20Sopenharmony_ci			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
1678c2ecf20Sopenharmony_ci			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
1688c2ecf20Sopenharmony_ci				reada_add_block(rc, bytenr, &next_key, n_gen);
1698c2ecf20Sopenharmony_ci		}
1708c2ecf20Sopenharmony_ci	}
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_cicleanup:
1738c2ecf20Sopenharmony_ci	/*
1748c2ecf20Sopenharmony_ci	 * free extctl records
1758c2ecf20Sopenharmony_ci	 */
1768c2ecf20Sopenharmony_ci	while (!list_empty(&list)) {
1778c2ecf20Sopenharmony_ci		struct reada_control *rc;
1788c2ecf20Sopenharmony_ci		struct reada_extctl *rec;
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci		rec = list_first_entry(&list, struct reada_extctl, list);
1818c2ecf20Sopenharmony_ci		list_del(&rec->list);
1828c2ecf20Sopenharmony_ci		rc = rec->rc;
1838c2ecf20Sopenharmony_ci		kfree(rec);
1848c2ecf20Sopenharmony_ci
1858c2ecf20Sopenharmony_ci		kref_get(&rc->refcnt);
1868c2ecf20Sopenharmony_ci		if (atomic_dec_and_test(&rc->elems)) {
1878c2ecf20Sopenharmony_ci			kref_put(&rc->refcnt, reada_control_release);
1888c2ecf20Sopenharmony_ci			wake_up(&rc->wait);
1898c2ecf20Sopenharmony_ci		}
1908c2ecf20Sopenharmony_ci		kref_put(&rc->refcnt, reada_control_release);
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci		reada_extent_put(fs_info, re);	/* one ref for each entry */
1938c2ecf20Sopenharmony_ci	}
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_ci	return;
1968c2ecf20Sopenharmony_ci}
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ciint btree_readahead_hook(struct extent_buffer *eb, int err)
1998c2ecf20Sopenharmony_ci{
2008c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = eb->fs_info;
2018c2ecf20Sopenharmony_ci	int ret = 0;
2028c2ecf20Sopenharmony_ci	struct reada_extent *re;
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_ci	/* find extent */
2058c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
2068c2ecf20Sopenharmony_ci	re = radix_tree_lookup(&fs_info->reada_tree,
2078c2ecf20Sopenharmony_ci			       eb->start >> PAGE_SHIFT);
2088c2ecf20Sopenharmony_ci	if (re)
2098c2ecf20Sopenharmony_ci		re->refcnt++;
2108c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
2118c2ecf20Sopenharmony_ci	if (!re) {
2128c2ecf20Sopenharmony_ci		ret = -1;
2138c2ecf20Sopenharmony_ci		goto start_machine;
2148c2ecf20Sopenharmony_ci	}
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	__readahead_hook(fs_info, re, eb, err);
2178c2ecf20Sopenharmony_ci	reada_extent_put(fs_info, re);	/* our ref */
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_cistart_machine:
2208c2ecf20Sopenharmony_ci	reada_start_machine(fs_info);
2218c2ecf20Sopenharmony_ci	return ret;
2228c2ecf20Sopenharmony_ci}
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_cistatic struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
2258c2ecf20Sopenharmony_ci					  struct btrfs_bio *bbio)
2268c2ecf20Sopenharmony_ci{
2278c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = dev->fs_info;
2288c2ecf20Sopenharmony_ci	int ret;
2298c2ecf20Sopenharmony_ci	struct reada_zone *zone;
2308c2ecf20Sopenharmony_ci	struct btrfs_block_group *cache = NULL;
2318c2ecf20Sopenharmony_ci	u64 start;
2328c2ecf20Sopenharmony_ci	u64 end;
2338c2ecf20Sopenharmony_ci	int i;
2348c2ecf20Sopenharmony_ci
2358c2ecf20Sopenharmony_ci	zone = NULL;
2368c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
2378c2ecf20Sopenharmony_ci	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
2388c2ecf20Sopenharmony_ci				     logical >> PAGE_SHIFT, 1);
2398c2ecf20Sopenharmony_ci	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
2408c2ecf20Sopenharmony_ci		kref_get(&zone->refcnt);
2418c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
2428c2ecf20Sopenharmony_ci		return zone;
2438c2ecf20Sopenharmony_ci	}
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, logical);
2488c2ecf20Sopenharmony_ci	if (!cache)
2498c2ecf20Sopenharmony_ci		return NULL;
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci	start = cache->start;
2528c2ecf20Sopenharmony_ci	end = start + cache->length - 1;
2538c2ecf20Sopenharmony_ci	btrfs_put_block_group(cache);
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci	zone = kzalloc(sizeof(*zone), GFP_KERNEL);
2568c2ecf20Sopenharmony_ci	if (!zone)
2578c2ecf20Sopenharmony_ci		return NULL;
2588c2ecf20Sopenharmony_ci
2598c2ecf20Sopenharmony_ci	ret = radix_tree_preload(GFP_KERNEL);
2608c2ecf20Sopenharmony_ci	if (ret) {
2618c2ecf20Sopenharmony_ci		kfree(zone);
2628c2ecf20Sopenharmony_ci		return NULL;
2638c2ecf20Sopenharmony_ci	}
2648c2ecf20Sopenharmony_ci
2658c2ecf20Sopenharmony_ci	zone->start = start;
2668c2ecf20Sopenharmony_ci	zone->end = end;
2678c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&zone->list);
2688c2ecf20Sopenharmony_ci	spin_lock_init(&zone->lock);
2698c2ecf20Sopenharmony_ci	zone->locked = 0;
2708c2ecf20Sopenharmony_ci	kref_init(&zone->refcnt);
2718c2ecf20Sopenharmony_ci	zone->elems = 0;
2728c2ecf20Sopenharmony_ci	zone->device = dev; /* our device always sits at index 0 */
2738c2ecf20Sopenharmony_ci	for (i = 0; i < bbio->num_stripes; ++i) {
2748c2ecf20Sopenharmony_ci		/* bounds have already been checked */
2758c2ecf20Sopenharmony_ci		zone->devs[i] = bbio->stripes[i].dev;
2768c2ecf20Sopenharmony_ci	}
2778c2ecf20Sopenharmony_ci	zone->ndevs = bbio->num_stripes;
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
2808c2ecf20Sopenharmony_ci	ret = radix_tree_insert(&dev->reada_zones,
2818c2ecf20Sopenharmony_ci				(unsigned long)(zone->end >> PAGE_SHIFT),
2828c2ecf20Sopenharmony_ci				zone);
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	if (ret == -EEXIST) {
2858c2ecf20Sopenharmony_ci		kfree(zone);
2868c2ecf20Sopenharmony_ci		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
2878c2ecf20Sopenharmony_ci					     logical >> PAGE_SHIFT, 1);
2888c2ecf20Sopenharmony_ci		if (ret == 1 && logical >= zone->start && logical <= zone->end)
2898c2ecf20Sopenharmony_ci			kref_get(&zone->refcnt);
2908c2ecf20Sopenharmony_ci		else
2918c2ecf20Sopenharmony_ci			zone = NULL;
2928c2ecf20Sopenharmony_ci	}
2938c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
2948c2ecf20Sopenharmony_ci	radix_tree_preload_end();
2958c2ecf20Sopenharmony_ci
2968c2ecf20Sopenharmony_ci	return zone;
2978c2ecf20Sopenharmony_ci}
2988c2ecf20Sopenharmony_ci
2998c2ecf20Sopenharmony_cistatic struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
3008c2ecf20Sopenharmony_ci					      u64 logical,
3018c2ecf20Sopenharmony_ci					      struct btrfs_key *top)
3028c2ecf20Sopenharmony_ci{
3038c2ecf20Sopenharmony_ci	int ret;
3048c2ecf20Sopenharmony_ci	struct reada_extent *re = NULL;
3058c2ecf20Sopenharmony_ci	struct reada_extent *re_exist = NULL;
3068c2ecf20Sopenharmony_ci	struct btrfs_bio *bbio = NULL;
3078c2ecf20Sopenharmony_ci	struct btrfs_device *dev;
3088c2ecf20Sopenharmony_ci	struct btrfs_device *prev_dev;
3098c2ecf20Sopenharmony_ci	u64 length;
3108c2ecf20Sopenharmony_ci	int real_stripes;
3118c2ecf20Sopenharmony_ci	int nzones = 0;
3128c2ecf20Sopenharmony_ci	unsigned long index = logical >> PAGE_SHIFT;
3138c2ecf20Sopenharmony_ci	int dev_replace_is_ongoing;
3148c2ecf20Sopenharmony_ci	int have_zone = 0;
3158c2ecf20Sopenharmony_ci
3168c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
3178c2ecf20Sopenharmony_ci	re = radix_tree_lookup(&fs_info->reada_tree, index);
3188c2ecf20Sopenharmony_ci	if (re)
3198c2ecf20Sopenharmony_ci		re->refcnt++;
3208c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
3218c2ecf20Sopenharmony_ci
3228c2ecf20Sopenharmony_ci	if (re)
3238c2ecf20Sopenharmony_ci		return re;
3248c2ecf20Sopenharmony_ci
3258c2ecf20Sopenharmony_ci	re = kzalloc(sizeof(*re), GFP_KERNEL);
3268c2ecf20Sopenharmony_ci	if (!re)
3278c2ecf20Sopenharmony_ci		return NULL;
3288c2ecf20Sopenharmony_ci
3298c2ecf20Sopenharmony_ci	re->logical = logical;
3308c2ecf20Sopenharmony_ci	re->top = *top;
3318c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&re->extctl);
3328c2ecf20Sopenharmony_ci	spin_lock_init(&re->lock);
3338c2ecf20Sopenharmony_ci	re->refcnt = 1;
3348c2ecf20Sopenharmony_ci
3358c2ecf20Sopenharmony_ci	/*
3368c2ecf20Sopenharmony_ci	 * map block
3378c2ecf20Sopenharmony_ci	 */
3388c2ecf20Sopenharmony_ci	length = fs_info->nodesize;
3398c2ecf20Sopenharmony_ci	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
3408c2ecf20Sopenharmony_ci			&length, &bbio, 0);
3418c2ecf20Sopenharmony_ci	if (ret || !bbio || length < fs_info->nodesize)
3428c2ecf20Sopenharmony_ci		goto error;
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_ci	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
3458c2ecf20Sopenharmony_ci		btrfs_err(fs_info,
3468c2ecf20Sopenharmony_ci			   "readahead: more than %d copies not supported",
3478c2ecf20Sopenharmony_ci			   BTRFS_MAX_MIRRORS);
3488c2ecf20Sopenharmony_ci		goto error;
3498c2ecf20Sopenharmony_ci	}
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci	real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
3528c2ecf20Sopenharmony_ci	for (nzones = 0; nzones < real_stripes; ++nzones) {
3538c2ecf20Sopenharmony_ci		struct reada_zone *zone;
3548c2ecf20Sopenharmony_ci
3558c2ecf20Sopenharmony_ci		dev = bbio->stripes[nzones].dev;
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ci		/* cannot read ahead on missing device. */
3588c2ecf20Sopenharmony_ci		if (!dev->bdev)
3598c2ecf20Sopenharmony_ci			continue;
3608c2ecf20Sopenharmony_ci
3618c2ecf20Sopenharmony_ci		zone = reada_find_zone(dev, logical, bbio);
3628c2ecf20Sopenharmony_ci		if (!zone)
3638c2ecf20Sopenharmony_ci			continue;
3648c2ecf20Sopenharmony_ci
3658c2ecf20Sopenharmony_ci		re->zones[re->nzones++] = zone;
3668c2ecf20Sopenharmony_ci		spin_lock(&zone->lock);
3678c2ecf20Sopenharmony_ci		if (!zone->elems)
3688c2ecf20Sopenharmony_ci			kref_get(&zone->refcnt);
3698c2ecf20Sopenharmony_ci		++zone->elems;
3708c2ecf20Sopenharmony_ci		spin_unlock(&zone->lock);
3718c2ecf20Sopenharmony_ci		spin_lock(&fs_info->reada_lock);
3728c2ecf20Sopenharmony_ci		kref_put(&zone->refcnt, reada_zone_release);
3738c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
3748c2ecf20Sopenharmony_ci	}
3758c2ecf20Sopenharmony_ci	if (re->nzones == 0) {
3768c2ecf20Sopenharmony_ci		/* not a single zone found, error and out */
3778c2ecf20Sopenharmony_ci		goto error;
3788c2ecf20Sopenharmony_ci	}
3798c2ecf20Sopenharmony_ci
3808c2ecf20Sopenharmony_ci	/* Insert extent in reada tree + all per-device trees, all or nothing */
3818c2ecf20Sopenharmony_ci	down_read(&fs_info->dev_replace.rwsem);
3828c2ecf20Sopenharmony_ci	ret = radix_tree_preload(GFP_KERNEL);
3838c2ecf20Sopenharmony_ci	if (ret) {
3848c2ecf20Sopenharmony_ci		up_read(&fs_info->dev_replace.rwsem);
3858c2ecf20Sopenharmony_ci		goto error;
3868c2ecf20Sopenharmony_ci	}
3878c2ecf20Sopenharmony_ci
3888c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
3898c2ecf20Sopenharmony_ci	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
3908c2ecf20Sopenharmony_ci	if (ret == -EEXIST) {
3918c2ecf20Sopenharmony_ci		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
3928c2ecf20Sopenharmony_ci		re_exist->refcnt++;
3938c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
3948c2ecf20Sopenharmony_ci		radix_tree_preload_end();
3958c2ecf20Sopenharmony_ci		up_read(&fs_info->dev_replace.rwsem);
3968c2ecf20Sopenharmony_ci		goto error;
3978c2ecf20Sopenharmony_ci	}
3988c2ecf20Sopenharmony_ci	if (ret) {
3998c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
4008c2ecf20Sopenharmony_ci		radix_tree_preload_end();
4018c2ecf20Sopenharmony_ci		up_read(&fs_info->dev_replace.rwsem);
4028c2ecf20Sopenharmony_ci		goto error;
4038c2ecf20Sopenharmony_ci	}
4048c2ecf20Sopenharmony_ci	radix_tree_preload_end();
4058c2ecf20Sopenharmony_ci	prev_dev = NULL;
4068c2ecf20Sopenharmony_ci	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
4078c2ecf20Sopenharmony_ci			&fs_info->dev_replace);
4088c2ecf20Sopenharmony_ci	for (nzones = 0; nzones < re->nzones; ++nzones) {
4098c2ecf20Sopenharmony_ci		dev = re->zones[nzones]->device;
4108c2ecf20Sopenharmony_ci
4118c2ecf20Sopenharmony_ci		if (dev == prev_dev) {
4128c2ecf20Sopenharmony_ci			/*
4138c2ecf20Sopenharmony_ci			 * in case of DUP, just add the first zone. As both
4148c2ecf20Sopenharmony_ci			 * are on the same device, there's nothing to gain
4158c2ecf20Sopenharmony_ci			 * from adding both.
4168c2ecf20Sopenharmony_ci			 * Also, it wouldn't work, as the tree is per device
4178c2ecf20Sopenharmony_ci			 * and adding would fail with EEXIST
4188c2ecf20Sopenharmony_ci			 */
4198c2ecf20Sopenharmony_ci			continue;
4208c2ecf20Sopenharmony_ci		}
4218c2ecf20Sopenharmony_ci		if (!dev->bdev)
4228c2ecf20Sopenharmony_ci			continue;
4238c2ecf20Sopenharmony_ci
4248c2ecf20Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
4258c2ecf20Sopenharmony_ci			continue;
4268c2ecf20Sopenharmony_ci
4278c2ecf20Sopenharmony_ci		if (dev_replace_is_ongoing &&
4288c2ecf20Sopenharmony_ci		    dev == fs_info->dev_replace.tgtdev) {
4298c2ecf20Sopenharmony_ci			/*
4308c2ecf20Sopenharmony_ci			 * as this device is selected for reading only as
4318c2ecf20Sopenharmony_ci			 * a last resort, skip it for read ahead.
4328c2ecf20Sopenharmony_ci			 */
4338c2ecf20Sopenharmony_ci			continue;
4348c2ecf20Sopenharmony_ci		}
4358c2ecf20Sopenharmony_ci		prev_dev = dev;
4368c2ecf20Sopenharmony_ci		ret = radix_tree_insert(&dev->reada_extents, index, re);
4378c2ecf20Sopenharmony_ci		if (ret) {
4388c2ecf20Sopenharmony_ci			while (--nzones >= 0) {
4398c2ecf20Sopenharmony_ci				dev = re->zones[nzones]->device;
4408c2ecf20Sopenharmony_ci				BUG_ON(dev == NULL);
4418c2ecf20Sopenharmony_ci				/* ignore whether the entry was inserted */
4428c2ecf20Sopenharmony_ci				radix_tree_delete(&dev->reada_extents, index);
4438c2ecf20Sopenharmony_ci			}
4448c2ecf20Sopenharmony_ci			radix_tree_delete(&fs_info->reada_tree, index);
4458c2ecf20Sopenharmony_ci			spin_unlock(&fs_info->reada_lock);
4468c2ecf20Sopenharmony_ci			up_read(&fs_info->dev_replace.rwsem);
4478c2ecf20Sopenharmony_ci			goto error;
4488c2ecf20Sopenharmony_ci		}
4498c2ecf20Sopenharmony_ci		have_zone = 1;
4508c2ecf20Sopenharmony_ci	}
4518c2ecf20Sopenharmony_ci	if (!have_zone)
4528c2ecf20Sopenharmony_ci		radix_tree_delete(&fs_info->reada_tree, index);
4538c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
4548c2ecf20Sopenharmony_ci	up_read(&fs_info->dev_replace.rwsem);
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	if (!have_zone)
4578c2ecf20Sopenharmony_ci		goto error;
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	btrfs_put_bbio(bbio);
4608c2ecf20Sopenharmony_ci	return re;
4618c2ecf20Sopenharmony_ci
4628c2ecf20Sopenharmony_cierror:
4638c2ecf20Sopenharmony_ci	for (nzones = 0; nzones < re->nzones; ++nzones) {
4648c2ecf20Sopenharmony_ci		struct reada_zone *zone;
4658c2ecf20Sopenharmony_ci
4668c2ecf20Sopenharmony_ci		zone = re->zones[nzones];
4678c2ecf20Sopenharmony_ci		kref_get(&zone->refcnt);
4688c2ecf20Sopenharmony_ci		spin_lock(&zone->lock);
4698c2ecf20Sopenharmony_ci		--zone->elems;
4708c2ecf20Sopenharmony_ci		if (zone->elems == 0) {
4718c2ecf20Sopenharmony_ci			/*
4728c2ecf20Sopenharmony_ci			 * no fs_info->reada_lock needed, as this can't be
4738c2ecf20Sopenharmony_ci			 * the last ref
4748c2ecf20Sopenharmony_ci			 */
4758c2ecf20Sopenharmony_ci			kref_put(&zone->refcnt, reada_zone_release);
4768c2ecf20Sopenharmony_ci		}
4778c2ecf20Sopenharmony_ci		spin_unlock(&zone->lock);
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_ci		spin_lock(&fs_info->reada_lock);
4808c2ecf20Sopenharmony_ci		kref_put(&zone->refcnt, reada_zone_release);
4818c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
4828c2ecf20Sopenharmony_ci	}
4838c2ecf20Sopenharmony_ci	btrfs_put_bbio(bbio);
4848c2ecf20Sopenharmony_ci	kfree(re);
4858c2ecf20Sopenharmony_ci	return re_exist;
4868c2ecf20Sopenharmony_ci}
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_cistatic void reada_extent_put(struct btrfs_fs_info *fs_info,
4898c2ecf20Sopenharmony_ci			     struct reada_extent *re)
4908c2ecf20Sopenharmony_ci{
4918c2ecf20Sopenharmony_ci	int i;
4928c2ecf20Sopenharmony_ci	unsigned long index = re->logical >> PAGE_SHIFT;
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
4958c2ecf20Sopenharmony_ci	if (--re->refcnt) {
4968c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
4978c2ecf20Sopenharmony_ci		return;
4988c2ecf20Sopenharmony_ci	}
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ci	radix_tree_delete(&fs_info->reada_tree, index);
5018c2ecf20Sopenharmony_ci	for (i = 0; i < re->nzones; ++i) {
5028c2ecf20Sopenharmony_ci		struct reada_zone *zone = re->zones[i];
5038c2ecf20Sopenharmony_ci
5048c2ecf20Sopenharmony_ci		radix_tree_delete(&zone->device->reada_extents, index);
5058c2ecf20Sopenharmony_ci	}
5068c2ecf20Sopenharmony_ci
5078c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
5088c2ecf20Sopenharmony_ci
5098c2ecf20Sopenharmony_ci	for (i = 0; i < re->nzones; ++i) {
5108c2ecf20Sopenharmony_ci		struct reada_zone *zone = re->zones[i];
5118c2ecf20Sopenharmony_ci
5128c2ecf20Sopenharmony_ci		kref_get(&zone->refcnt);
5138c2ecf20Sopenharmony_ci		spin_lock(&zone->lock);
5148c2ecf20Sopenharmony_ci		--zone->elems;
5158c2ecf20Sopenharmony_ci		if (zone->elems == 0) {
5168c2ecf20Sopenharmony_ci			/* no fs_info->reada_lock needed, as this can't be
5178c2ecf20Sopenharmony_ci			 * the last ref */
5188c2ecf20Sopenharmony_ci			kref_put(&zone->refcnt, reada_zone_release);
5198c2ecf20Sopenharmony_ci		}
5208c2ecf20Sopenharmony_ci		spin_unlock(&zone->lock);
5218c2ecf20Sopenharmony_ci
5228c2ecf20Sopenharmony_ci		spin_lock(&fs_info->reada_lock);
5238c2ecf20Sopenharmony_ci		kref_put(&zone->refcnt, reada_zone_release);
5248c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
5258c2ecf20Sopenharmony_ci	}
5268c2ecf20Sopenharmony_ci
5278c2ecf20Sopenharmony_ci	kfree(re);
5288c2ecf20Sopenharmony_ci}
5298c2ecf20Sopenharmony_ci
5308c2ecf20Sopenharmony_cistatic void reada_zone_release(struct kref *kref)
5318c2ecf20Sopenharmony_ci{
5328c2ecf20Sopenharmony_ci	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
5338c2ecf20Sopenharmony_ci
5348c2ecf20Sopenharmony_ci	radix_tree_delete(&zone->device->reada_zones,
5358c2ecf20Sopenharmony_ci			  zone->end >> PAGE_SHIFT);
5368c2ecf20Sopenharmony_ci
5378c2ecf20Sopenharmony_ci	kfree(zone);
5388c2ecf20Sopenharmony_ci}
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_cistatic void reada_control_release(struct kref *kref)
5418c2ecf20Sopenharmony_ci{
5428c2ecf20Sopenharmony_ci	struct reada_control *rc = container_of(kref, struct reada_control,
5438c2ecf20Sopenharmony_ci						refcnt);
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_ci	kfree(rc);
5468c2ecf20Sopenharmony_ci}
5478c2ecf20Sopenharmony_ci
5488c2ecf20Sopenharmony_cistatic int reada_add_block(struct reada_control *rc, u64 logical,
5498c2ecf20Sopenharmony_ci			   struct btrfs_key *top, u64 generation)
5508c2ecf20Sopenharmony_ci{
5518c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = rc->fs_info;
5528c2ecf20Sopenharmony_ci	struct reada_extent *re;
5538c2ecf20Sopenharmony_ci	struct reada_extctl *rec;
5548c2ecf20Sopenharmony_ci
5558c2ecf20Sopenharmony_ci	/* takes one ref */
5568c2ecf20Sopenharmony_ci	re = reada_find_extent(fs_info, logical, top);
5578c2ecf20Sopenharmony_ci	if (!re)
5588c2ecf20Sopenharmony_ci		return -1;
5598c2ecf20Sopenharmony_ci
5608c2ecf20Sopenharmony_ci	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
5618c2ecf20Sopenharmony_ci	if (!rec) {
5628c2ecf20Sopenharmony_ci		reada_extent_put(fs_info, re);
5638c2ecf20Sopenharmony_ci		return -ENOMEM;
5648c2ecf20Sopenharmony_ci	}
5658c2ecf20Sopenharmony_ci
5668c2ecf20Sopenharmony_ci	rec->rc = rc;
5678c2ecf20Sopenharmony_ci	rec->generation = generation;
5688c2ecf20Sopenharmony_ci	atomic_inc(&rc->elems);
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_ci	spin_lock(&re->lock);
5718c2ecf20Sopenharmony_ci	list_add_tail(&rec->list, &re->extctl);
5728c2ecf20Sopenharmony_ci	spin_unlock(&re->lock);
5738c2ecf20Sopenharmony_ci
5748c2ecf20Sopenharmony_ci	/* leave the ref on the extent */
5758c2ecf20Sopenharmony_ci
5768c2ecf20Sopenharmony_ci	return 0;
5778c2ecf20Sopenharmony_ci}
5788c2ecf20Sopenharmony_ci
5798c2ecf20Sopenharmony_ci/*
5808c2ecf20Sopenharmony_ci * called with fs_info->reada_lock held
5818c2ecf20Sopenharmony_ci */
5828c2ecf20Sopenharmony_cistatic void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
5838c2ecf20Sopenharmony_ci{
5848c2ecf20Sopenharmony_ci	int i;
5858c2ecf20Sopenharmony_ci	unsigned long index = zone->end >> PAGE_SHIFT;
5868c2ecf20Sopenharmony_ci
5878c2ecf20Sopenharmony_ci	for (i = 0; i < zone->ndevs; ++i) {
5888c2ecf20Sopenharmony_ci		struct reada_zone *peer;
5898c2ecf20Sopenharmony_ci		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
5908c2ecf20Sopenharmony_ci		if (peer && peer->device != zone->device)
5918c2ecf20Sopenharmony_ci			peer->locked = lock;
5928c2ecf20Sopenharmony_ci	}
5938c2ecf20Sopenharmony_ci}
5948c2ecf20Sopenharmony_ci
5958c2ecf20Sopenharmony_ci/*
5968c2ecf20Sopenharmony_ci * called with fs_info->reada_lock held
5978c2ecf20Sopenharmony_ci */
5988c2ecf20Sopenharmony_cistatic int reada_pick_zone(struct btrfs_device *dev)
5998c2ecf20Sopenharmony_ci{
6008c2ecf20Sopenharmony_ci	struct reada_zone *top_zone = NULL;
6018c2ecf20Sopenharmony_ci	struct reada_zone *top_locked_zone = NULL;
6028c2ecf20Sopenharmony_ci	u64 top_elems = 0;
6038c2ecf20Sopenharmony_ci	u64 top_locked_elems = 0;
6048c2ecf20Sopenharmony_ci	unsigned long index = 0;
6058c2ecf20Sopenharmony_ci	int ret;
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci	if (dev->reada_curr_zone) {
6088c2ecf20Sopenharmony_ci		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
6098c2ecf20Sopenharmony_ci		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
6108c2ecf20Sopenharmony_ci		dev->reada_curr_zone = NULL;
6118c2ecf20Sopenharmony_ci	}
6128c2ecf20Sopenharmony_ci	/* pick the zone with the most elements */
6138c2ecf20Sopenharmony_ci	while (1) {
6148c2ecf20Sopenharmony_ci		struct reada_zone *zone;
6158c2ecf20Sopenharmony_ci
6168c2ecf20Sopenharmony_ci		ret = radix_tree_gang_lookup(&dev->reada_zones,
6178c2ecf20Sopenharmony_ci					     (void **)&zone, index, 1);
6188c2ecf20Sopenharmony_ci		if (ret == 0)
6198c2ecf20Sopenharmony_ci			break;
6208c2ecf20Sopenharmony_ci		index = (zone->end >> PAGE_SHIFT) + 1;
6218c2ecf20Sopenharmony_ci		if (zone->locked) {
6228c2ecf20Sopenharmony_ci			if (zone->elems > top_locked_elems) {
6238c2ecf20Sopenharmony_ci				top_locked_elems = zone->elems;
6248c2ecf20Sopenharmony_ci				top_locked_zone = zone;
6258c2ecf20Sopenharmony_ci			}
6268c2ecf20Sopenharmony_ci		} else {
6278c2ecf20Sopenharmony_ci			if (zone->elems > top_elems) {
6288c2ecf20Sopenharmony_ci				top_elems = zone->elems;
6298c2ecf20Sopenharmony_ci				top_zone = zone;
6308c2ecf20Sopenharmony_ci			}
6318c2ecf20Sopenharmony_ci		}
6328c2ecf20Sopenharmony_ci	}
6338c2ecf20Sopenharmony_ci	if (top_zone)
6348c2ecf20Sopenharmony_ci		dev->reada_curr_zone = top_zone;
6358c2ecf20Sopenharmony_ci	else if (top_locked_zone)
6368c2ecf20Sopenharmony_ci		dev->reada_curr_zone = top_locked_zone;
6378c2ecf20Sopenharmony_ci	else
6388c2ecf20Sopenharmony_ci		return 0;
6398c2ecf20Sopenharmony_ci
6408c2ecf20Sopenharmony_ci	dev->reada_next = dev->reada_curr_zone->start;
6418c2ecf20Sopenharmony_ci	kref_get(&dev->reada_curr_zone->refcnt);
6428c2ecf20Sopenharmony_ci	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
6438c2ecf20Sopenharmony_ci
6448c2ecf20Sopenharmony_ci	return 1;
6458c2ecf20Sopenharmony_ci}
6468c2ecf20Sopenharmony_ci
6478c2ecf20Sopenharmony_cistatic int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
6488c2ecf20Sopenharmony_ci				    int mirror_num, struct extent_buffer **eb)
6498c2ecf20Sopenharmony_ci{
6508c2ecf20Sopenharmony_ci	struct extent_buffer *buf = NULL;
6518c2ecf20Sopenharmony_ci	int ret;
6528c2ecf20Sopenharmony_ci
6538c2ecf20Sopenharmony_ci	buf = btrfs_find_create_tree_block(fs_info, bytenr);
6548c2ecf20Sopenharmony_ci	if (IS_ERR(buf))
6558c2ecf20Sopenharmony_ci		return 0;
6568c2ecf20Sopenharmony_ci
6578c2ecf20Sopenharmony_ci	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
6588c2ecf20Sopenharmony_ci
6598c2ecf20Sopenharmony_ci	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
6608c2ecf20Sopenharmony_ci	if (ret) {
6618c2ecf20Sopenharmony_ci		free_extent_buffer_stale(buf);
6628c2ecf20Sopenharmony_ci		return ret;
6638c2ecf20Sopenharmony_ci	}
6648c2ecf20Sopenharmony_ci
6658c2ecf20Sopenharmony_ci	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
6668c2ecf20Sopenharmony_ci		free_extent_buffer_stale(buf);
6678c2ecf20Sopenharmony_ci		return -EIO;
6688c2ecf20Sopenharmony_ci	} else if (extent_buffer_uptodate(buf)) {
6698c2ecf20Sopenharmony_ci		*eb = buf;
6708c2ecf20Sopenharmony_ci	} else {
6718c2ecf20Sopenharmony_ci		free_extent_buffer(buf);
6728c2ecf20Sopenharmony_ci	}
6738c2ecf20Sopenharmony_ci	return 0;
6748c2ecf20Sopenharmony_ci}
6758c2ecf20Sopenharmony_ci
6768c2ecf20Sopenharmony_cistatic int reada_start_machine_dev(struct btrfs_device *dev)
6778c2ecf20Sopenharmony_ci{
6788c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = dev->fs_info;
6798c2ecf20Sopenharmony_ci	struct reada_extent *re = NULL;
6808c2ecf20Sopenharmony_ci	int mirror_num = 0;
6818c2ecf20Sopenharmony_ci	struct extent_buffer *eb = NULL;
6828c2ecf20Sopenharmony_ci	u64 logical;
6838c2ecf20Sopenharmony_ci	int ret;
6848c2ecf20Sopenharmony_ci	int i;
6858c2ecf20Sopenharmony_ci
6868c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
6878c2ecf20Sopenharmony_ci	if (dev->reada_curr_zone == NULL) {
6888c2ecf20Sopenharmony_ci		ret = reada_pick_zone(dev);
6898c2ecf20Sopenharmony_ci		if (!ret) {
6908c2ecf20Sopenharmony_ci			spin_unlock(&fs_info->reada_lock);
6918c2ecf20Sopenharmony_ci			return 0;
6928c2ecf20Sopenharmony_ci		}
6938c2ecf20Sopenharmony_ci	}
6948c2ecf20Sopenharmony_ci	/*
6958c2ecf20Sopenharmony_ci	 * FIXME currently we issue the reads one extent at a time. If we have
6968c2ecf20Sopenharmony_ci	 * a contiguous block of extents, we could also coagulate them or use
6978c2ecf20Sopenharmony_ci	 * plugging to speed things up
6988c2ecf20Sopenharmony_ci	 */
6998c2ecf20Sopenharmony_ci	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
7008c2ecf20Sopenharmony_ci				     dev->reada_next >> PAGE_SHIFT, 1);
7018c2ecf20Sopenharmony_ci	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
7028c2ecf20Sopenharmony_ci		ret = reada_pick_zone(dev);
7038c2ecf20Sopenharmony_ci		if (!ret) {
7048c2ecf20Sopenharmony_ci			spin_unlock(&fs_info->reada_lock);
7058c2ecf20Sopenharmony_ci			return 0;
7068c2ecf20Sopenharmony_ci		}
7078c2ecf20Sopenharmony_ci		re = NULL;
7088c2ecf20Sopenharmony_ci		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
7098c2ecf20Sopenharmony_ci					dev->reada_next >> PAGE_SHIFT, 1);
7108c2ecf20Sopenharmony_ci	}
7118c2ecf20Sopenharmony_ci	if (ret == 0) {
7128c2ecf20Sopenharmony_ci		spin_unlock(&fs_info->reada_lock);
7138c2ecf20Sopenharmony_ci		return 0;
7148c2ecf20Sopenharmony_ci	}
7158c2ecf20Sopenharmony_ci	dev->reada_next = re->logical + fs_info->nodesize;
7168c2ecf20Sopenharmony_ci	re->refcnt++;
7178c2ecf20Sopenharmony_ci
7188c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
7198c2ecf20Sopenharmony_ci
7208c2ecf20Sopenharmony_ci	spin_lock(&re->lock);
7218c2ecf20Sopenharmony_ci	if (re->scheduled || list_empty(&re->extctl)) {
7228c2ecf20Sopenharmony_ci		spin_unlock(&re->lock);
7238c2ecf20Sopenharmony_ci		reada_extent_put(fs_info, re);
7248c2ecf20Sopenharmony_ci		return 0;
7258c2ecf20Sopenharmony_ci	}
7268c2ecf20Sopenharmony_ci	re->scheduled = 1;
7278c2ecf20Sopenharmony_ci	spin_unlock(&re->lock);
7288c2ecf20Sopenharmony_ci
7298c2ecf20Sopenharmony_ci	/*
7308c2ecf20Sopenharmony_ci	 * find mirror num
7318c2ecf20Sopenharmony_ci	 */
7328c2ecf20Sopenharmony_ci	for (i = 0; i < re->nzones; ++i) {
7338c2ecf20Sopenharmony_ci		if (re->zones[i]->device == dev) {
7348c2ecf20Sopenharmony_ci			mirror_num = i + 1;
7358c2ecf20Sopenharmony_ci			break;
7368c2ecf20Sopenharmony_ci		}
7378c2ecf20Sopenharmony_ci	}
7388c2ecf20Sopenharmony_ci	logical = re->logical;
7398c2ecf20Sopenharmony_ci
7408c2ecf20Sopenharmony_ci	atomic_inc(&dev->reada_in_flight);
7418c2ecf20Sopenharmony_ci	ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
7428c2ecf20Sopenharmony_ci	if (ret)
7438c2ecf20Sopenharmony_ci		__readahead_hook(fs_info, re, NULL, ret);
7448c2ecf20Sopenharmony_ci	else if (eb)
7458c2ecf20Sopenharmony_ci		__readahead_hook(fs_info, re, eb, ret);
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci	if (eb)
7488c2ecf20Sopenharmony_ci		free_extent_buffer(eb);
7498c2ecf20Sopenharmony_ci
7508c2ecf20Sopenharmony_ci	atomic_dec(&dev->reada_in_flight);
7518c2ecf20Sopenharmony_ci	reada_extent_put(fs_info, re);
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci	return 1;
7548c2ecf20Sopenharmony_ci
7558c2ecf20Sopenharmony_ci}
7568c2ecf20Sopenharmony_ci
7578c2ecf20Sopenharmony_cistatic void reada_start_machine_worker(struct btrfs_work *work)
7588c2ecf20Sopenharmony_ci{
7598c2ecf20Sopenharmony_ci	struct reada_machine_work *rmw;
7608c2ecf20Sopenharmony_ci	int old_ioprio;
7618c2ecf20Sopenharmony_ci
7628c2ecf20Sopenharmony_ci	rmw = container_of(work, struct reada_machine_work, work);
7638c2ecf20Sopenharmony_ci
7648c2ecf20Sopenharmony_ci	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
7658c2ecf20Sopenharmony_ci				       task_nice_ioprio(current));
7668c2ecf20Sopenharmony_ci	set_task_ioprio(current, BTRFS_IOPRIO_READA);
7678c2ecf20Sopenharmony_ci	__reada_start_machine(rmw->fs_info);
7688c2ecf20Sopenharmony_ci	set_task_ioprio(current, old_ioprio);
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	atomic_dec(&rmw->fs_info->reada_works_cnt);
7718c2ecf20Sopenharmony_ci
7728c2ecf20Sopenharmony_ci	kfree(rmw);
7738c2ecf20Sopenharmony_ci}
7748c2ecf20Sopenharmony_ci
7758c2ecf20Sopenharmony_ci/* Try to start up to 10k READA requests for a group of devices */
7768c2ecf20Sopenharmony_cistatic int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
7778c2ecf20Sopenharmony_ci{
7788c2ecf20Sopenharmony_ci	u64 enqueued;
7798c2ecf20Sopenharmony_ci	u64 total = 0;
7808c2ecf20Sopenharmony_ci	struct btrfs_device *device;
7818c2ecf20Sopenharmony_ci
7828c2ecf20Sopenharmony_ci	do {
7838c2ecf20Sopenharmony_ci		enqueued = 0;
7848c2ecf20Sopenharmony_ci		list_for_each_entry(device, &fs_devices->devices, dev_list) {
7858c2ecf20Sopenharmony_ci			if (atomic_read(&device->reada_in_flight) <
7868c2ecf20Sopenharmony_ci			    MAX_IN_FLIGHT)
7878c2ecf20Sopenharmony_ci				enqueued += reada_start_machine_dev(device);
7888c2ecf20Sopenharmony_ci		}
7898c2ecf20Sopenharmony_ci		total += enqueued;
7908c2ecf20Sopenharmony_ci	} while (enqueued && total < 10000);
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci	return total;
7938c2ecf20Sopenharmony_ci}
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_cistatic void __reada_start_machine(struct btrfs_fs_info *fs_info)
7968c2ecf20Sopenharmony_ci{
7978c2ecf20Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7988c2ecf20Sopenharmony_ci	int i;
7998c2ecf20Sopenharmony_ci	u64 enqueued = 0;
8008c2ecf20Sopenharmony_ci
8018c2ecf20Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
8028c2ecf20Sopenharmony_ci
8038c2ecf20Sopenharmony_ci	enqueued += reada_start_for_fsdevs(fs_devices);
8048c2ecf20Sopenharmony_ci	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
8058c2ecf20Sopenharmony_ci		enqueued += reada_start_for_fsdevs(seed_devs);
8068c2ecf20Sopenharmony_ci
8078c2ecf20Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
8088c2ecf20Sopenharmony_ci	if (enqueued == 0)
8098c2ecf20Sopenharmony_ci		return;
8108c2ecf20Sopenharmony_ci
8118c2ecf20Sopenharmony_ci	/*
8128c2ecf20Sopenharmony_ci	 * If everything is already in the cache, this is effectively single
8138c2ecf20Sopenharmony_ci	 * threaded. To a) not hold the caller for too long and b) to utilize
8148c2ecf20Sopenharmony_ci	 * more cores, we broke the loop above after 10000 iterations and now
8158c2ecf20Sopenharmony_ci	 * enqueue to workers to finish it. This will distribute the load to
8168c2ecf20Sopenharmony_ci	 * the cores.
8178c2ecf20Sopenharmony_ci	 */
8188c2ecf20Sopenharmony_ci	for (i = 0; i < 2; ++i) {
8198c2ecf20Sopenharmony_ci		reada_start_machine(fs_info);
8208c2ecf20Sopenharmony_ci		if (atomic_read(&fs_info->reada_works_cnt) >
8218c2ecf20Sopenharmony_ci		    BTRFS_MAX_MIRRORS * 2)
8228c2ecf20Sopenharmony_ci			break;
8238c2ecf20Sopenharmony_ci	}
8248c2ecf20Sopenharmony_ci}
8258c2ecf20Sopenharmony_ci
8268c2ecf20Sopenharmony_cistatic void reada_start_machine(struct btrfs_fs_info *fs_info)
8278c2ecf20Sopenharmony_ci{
8288c2ecf20Sopenharmony_ci	struct reada_machine_work *rmw;
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci	rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
8318c2ecf20Sopenharmony_ci	if (!rmw) {
8328c2ecf20Sopenharmony_ci		/* FIXME we cannot handle this properly right now */
8338c2ecf20Sopenharmony_ci		BUG();
8348c2ecf20Sopenharmony_ci	}
8358c2ecf20Sopenharmony_ci	btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
8368c2ecf20Sopenharmony_ci	rmw->fs_info = fs_info;
8378c2ecf20Sopenharmony_ci
8388c2ecf20Sopenharmony_ci	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
8398c2ecf20Sopenharmony_ci	atomic_inc(&fs_info->reada_works_cnt);
8408c2ecf20Sopenharmony_ci}
8418c2ecf20Sopenharmony_ci
#ifdef DEBUG
/*
 * Continue the current debug line with the zones of @re and the device ids
 * of each zone, then terminate the line.
 */
static void dump_extent_zones(struct reada_extent *re)
{
	int i;
	int j;

	for (i = 0; i < re->nzones; ++i) {
		struct reada_zone *zone = re->zones[i];

		pr_cont(" zone %llu-%llu devs", zone->start, zone->end);
		for (j = 0; j < zone->ndevs; ++j)
			pr_cont(" %lld", zone->devs[j]->devid);
	}
	pr_cont("\n");
}

/*
 * Debug aid: print the readahead state of the filesystem.
 *
 * For every device the number of reads in flight and all of its zones are
 * printed. If @all is non-zero, up to 16 extents per device are listed as
 * well, followed by every scheduled extent found in the global readahead
 * tree. Runs with fs_info->reada_lock held for the whole dump.
 */
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	struct reada_extent *re;
	struct reada_zone *zone;
	unsigned long index;
	int cnt;
	int j;

	spin_lock(&fs_info->reada_lock);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
			atomic_read(&device->reada_in_flight));

		/* All zones of this device. */
		index = 0;
		while (radix_tree_gang_lookup(&device->reada_zones,
					      (void **)&zone, index, 1)) {
			pr_debug("  zone %llu-%llu elems %llu locked %d devs",
				    zone->start, zone->end, zone->elems,
				    zone->locked);
			for (j = 0; j < zone->ndevs; ++j)
				pr_cont(" %lld", zone->devs[j]->devid);
			if (device->reada_curr_zone == zone)
				pr_cont(" curr off %llu",
					device->reada_next - zone->start);
			pr_cont("\n");
			index = (zone->end >> PAGE_SHIFT) + 1;
		}

		if (!all)
			continue;

		/* The first extents queued on this device, 16 at most. */
		index = 0;
		for (cnt = 0; cnt < 16; cnt++) {
			re = NULL;
			if (!radix_tree_gang_lookup(&device->reada_extents,
						    (void **)&re, index, 1))
				break;
			pr_debug("  re: logical %llu size %u empty %d scheduled %d",
				re->logical, fs_info->nodesize,
				list_empty(&re->extctl), re->scheduled);
			dump_extent_zones(re);
			index = (re->logical >> PAGE_SHIFT) + 1;
		}
	}

	if (all) {
		/* Every scheduled extent in the global tree. */
		index = 0;
		for (;;) {
			re = NULL;
			if (!radix_tree_gang_lookup(&fs_info->reada_tree,
						    (void **)&re, index, 1))
				break;
			if (re->scheduled) {
				pr_debug("re: logical %llu size %u list empty %d scheduled %d",
					re->logical, fs_info->nodesize,
					list_empty(&re->extctl),
					re->scheduled);
				dump_extent_zones(re);
			}
			index = (re->logical >> PAGE_SHIFT) + 1;
		}
	}
	spin_unlock(&fs_info->reada_lock);
}
#endif
9378c2ecf20Sopenharmony_ci
9388c2ecf20Sopenharmony_ci/*
9398c2ecf20Sopenharmony_ci * interface
9408c2ecf20Sopenharmony_ci */
9418c2ecf20Sopenharmony_cistruct reada_control *btrfs_reada_add(struct btrfs_root *root,
9428c2ecf20Sopenharmony_ci			struct btrfs_key *key_start, struct btrfs_key *key_end)
9438c2ecf20Sopenharmony_ci{
9448c2ecf20Sopenharmony_ci	struct reada_control *rc;
9458c2ecf20Sopenharmony_ci	u64 start;
9468c2ecf20Sopenharmony_ci	u64 generation;
9478c2ecf20Sopenharmony_ci	int ret;
9488c2ecf20Sopenharmony_ci	struct extent_buffer *node;
9498c2ecf20Sopenharmony_ci	static struct btrfs_key max_key = {
9508c2ecf20Sopenharmony_ci		.objectid = (u64)-1,
9518c2ecf20Sopenharmony_ci		.type = (u8)-1,
9528c2ecf20Sopenharmony_ci		.offset = (u64)-1
9538c2ecf20Sopenharmony_ci	};
9548c2ecf20Sopenharmony_ci
9558c2ecf20Sopenharmony_ci	rc = kzalloc(sizeof(*rc), GFP_KERNEL);
9568c2ecf20Sopenharmony_ci	if (!rc)
9578c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
9588c2ecf20Sopenharmony_ci
9598c2ecf20Sopenharmony_ci	rc->fs_info = root->fs_info;
9608c2ecf20Sopenharmony_ci	rc->key_start = *key_start;
9618c2ecf20Sopenharmony_ci	rc->key_end = *key_end;
9628c2ecf20Sopenharmony_ci	atomic_set(&rc->elems, 0);
9638c2ecf20Sopenharmony_ci	init_waitqueue_head(&rc->wait);
9648c2ecf20Sopenharmony_ci	kref_init(&rc->refcnt);
9658c2ecf20Sopenharmony_ci	kref_get(&rc->refcnt); /* one ref for having elements */
9668c2ecf20Sopenharmony_ci
9678c2ecf20Sopenharmony_ci	node = btrfs_root_node(root);
9688c2ecf20Sopenharmony_ci	start = node->start;
9698c2ecf20Sopenharmony_ci	generation = btrfs_header_generation(node);
9708c2ecf20Sopenharmony_ci	free_extent_buffer(node);
9718c2ecf20Sopenharmony_ci
9728c2ecf20Sopenharmony_ci	ret = reada_add_block(rc, start, &max_key, generation);
9738c2ecf20Sopenharmony_ci	if (ret) {
9748c2ecf20Sopenharmony_ci		kfree(rc);
9758c2ecf20Sopenharmony_ci		return ERR_PTR(ret);
9768c2ecf20Sopenharmony_ci	}
9778c2ecf20Sopenharmony_ci
9788c2ecf20Sopenharmony_ci	reada_start_machine(root->fs_info);
9798c2ecf20Sopenharmony_ci
9808c2ecf20Sopenharmony_ci	return rc;
9818c2ecf20Sopenharmony_ci}
9828c2ecf20Sopenharmony_ci
9838c2ecf20Sopenharmony_ci#ifdef DEBUG
9848c2ecf20Sopenharmony_ciint btrfs_reada_wait(void *handle)
9858c2ecf20Sopenharmony_ci{
9868c2ecf20Sopenharmony_ci	struct reada_control *rc = handle;
9878c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = rc->fs_info;
9888c2ecf20Sopenharmony_ci
9898c2ecf20Sopenharmony_ci	while (atomic_read(&rc->elems)) {
9908c2ecf20Sopenharmony_ci		if (!atomic_read(&fs_info->reada_works_cnt))
9918c2ecf20Sopenharmony_ci			reada_start_machine(fs_info);
9928c2ecf20Sopenharmony_ci		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
9938c2ecf20Sopenharmony_ci				   5 * HZ);
9948c2ecf20Sopenharmony_ci		dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
9958c2ecf20Sopenharmony_ci	}
9968c2ecf20Sopenharmony_ci
9978c2ecf20Sopenharmony_ci	dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
9988c2ecf20Sopenharmony_ci
9998c2ecf20Sopenharmony_ci	kref_put(&rc->refcnt, reada_control_release);
10008c2ecf20Sopenharmony_ci
10018c2ecf20Sopenharmony_ci	return 0;
10028c2ecf20Sopenharmony_ci}
10038c2ecf20Sopenharmony_ci#else
10048c2ecf20Sopenharmony_ciint btrfs_reada_wait(void *handle)
10058c2ecf20Sopenharmony_ci{
10068c2ecf20Sopenharmony_ci	struct reada_control *rc = handle;
10078c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = rc->fs_info;
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci	while (atomic_read(&rc->elems)) {
10108c2ecf20Sopenharmony_ci		if (!atomic_read(&fs_info->reada_works_cnt))
10118c2ecf20Sopenharmony_ci			reada_start_machine(fs_info);
10128c2ecf20Sopenharmony_ci		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
10138c2ecf20Sopenharmony_ci				   (HZ + 9) / 10);
10148c2ecf20Sopenharmony_ci	}
10158c2ecf20Sopenharmony_ci
10168c2ecf20Sopenharmony_ci	kref_put(&rc->refcnt, reada_control_release);
10178c2ecf20Sopenharmony_ci
10188c2ecf20Sopenharmony_ci	return 0;
10198c2ecf20Sopenharmony_ci}
10208c2ecf20Sopenharmony_ci#endif
10218c2ecf20Sopenharmony_ci
10228c2ecf20Sopenharmony_civoid btrfs_reada_detach(void *handle)
10238c2ecf20Sopenharmony_ci{
10248c2ecf20Sopenharmony_ci	struct reada_control *rc = handle;
10258c2ecf20Sopenharmony_ci
10268c2ecf20Sopenharmony_ci	kref_put(&rc->refcnt, reada_control_release);
10278c2ecf20Sopenharmony_ci}
10288c2ecf20Sopenharmony_ci
10298c2ecf20Sopenharmony_ci/*
10308c2ecf20Sopenharmony_ci * Before removing a device (device replace or device remove ioctls), call this
10318c2ecf20Sopenharmony_ci * function to wait for all existing readahead requests on the device and to
10328c2ecf20Sopenharmony_ci * make sure no one queues more readahead requests for the device.
10338c2ecf20Sopenharmony_ci *
10348c2ecf20Sopenharmony_ci * Must be called without holding neither the device list mutex nor the device
10358c2ecf20Sopenharmony_ci * replace semaphore, otherwise it will deadlock.
10368c2ecf20Sopenharmony_ci */
10378c2ecf20Sopenharmony_civoid btrfs_reada_remove_dev(struct btrfs_device *dev)
10388c2ecf20Sopenharmony_ci{
10398c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = dev->fs_info;
10408c2ecf20Sopenharmony_ci
10418c2ecf20Sopenharmony_ci	/* Serialize with readahead extent creation at reada_find_extent(). */
10428c2ecf20Sopenharmony_ci	spin_lock(&fs_info->reada_lock);
10438c2ecf20Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
10448c2ecf20Sopenharmony_ci	spin_unlock(&fs_info->reada_lock);
10458c2ecf20Sopenharmony_ci
10468c2ecf20Sopenharmony_ci	/*
10478c2ecf20Sopenharmony_ci	 * There might be readahead requests added to the radix trees which
10488c2ecf20Sopenharmony_ci	 * were not yet added to the readahead work queue. We need to start
10498c2ecf20Sopenharmony_ci	 * them and wait for their completion, otherwise we can end up with
10508c2ecf20Sopenharmony_ci	 * use-after-free problems when dropping the last reference on the
10518c2ecf20Sopenharmony_ci	 * readahead extents and their zones, as they need to access the
10528c2ecf20Sopenharmony_ci	 * device structure.
10538c2ecf20Sopenharmony_ci	 */
10548c2ecf20Sopenharmony_ci	reada_start_machine(fs_info);
10558c2ecf20Sopenharmony_ci	btrfs_flush_workqueue(fs_info->readahead_workers);
10568c2ecf20Sopenharmony_ci}
10578c2ecf20Sopenharmony_ci
10588c2ecf20Sopenharmony_ci/*
10598c2ecf20Sopenharmony_ci * If when removing a device (device replace or device remove ioctls) an error
10608c2ecf20Sopenharmony_ci * happens after calling btrfs_reada_remove_dev(), call this to undo what that
10618c2ecf20Sopenharmony_ci * function did. This is safe to call even if btrfs_reada_remove_dev() was not
10628c2ecf20Sopenharmony_ci * called before.
10638c2ecf20Sopenharmony_ci */
10648c2ecf20Sopenharmony_civoid btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
10658c2ecf20Sopenharmony_ci{
10668c2ecf20Sopenharmony_ci	spin_lock(&dev->fs_info->reada_lock);
10678c2ecf20Sopenharmony_ci	clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
10688c2ecf20Sopenharmony_ci	spin_unlock(&dev->fs_info->reada_lock);
10698c2ecf20Sopenharmony_ci}
1070