162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (c) 2000-2006 Silicon Graphics, Inc.
462306a36Sopenharmony_ci * All Rights Reserved.
562306a36Sopenharmony_ci */
662306a36Sopenharmony_ci#include "xfs.h"
762306a36Sopenharmony_ci#include <linux/backing-dev.h>
862306a36Sopenharmony_ci#include <linux/dax.h>
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include "xfs_shared.h"
1162306a36Sopenharmony_ci#include "xfs_format.h"
1262306a36Sopenharmony_ci#include "xfs_log_format.h"
1362306a36Sopenharmony_ci#include "xfs_trans_resv.h"
1462306a36Sopenharmony_ci#include "xfs_mount.h"
1562306a36Sopenharmony_ci#include "xfs_trace.h"
1662306a36Sopenharmony_ci#include "xfs_log.h"
1762306a36Sopenharmony_ci#include "xfs_log_recover.h"
1862306a36Sopenharmony_ci#include "xfs_log_priv.h"
1962306a36Sopenharmony_ci#include "xfs_trans.h"
2062306a36Sopenharmony_ci#include "xfs_buf_item.h"
2162306a36Sopenharmony_ci#include "xfs_errortag.h"
2262306a36Sopenharmony_ci#include "xfs_error.h"
2362306a36Sopenharmony_ci#include "xfs_ag.h"
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_cistruct kmem_cache *xfs_buf_cache;
2662306a36Sopenharmony_ci
2762306a36Sopenharmony_ci/*
2862306a36Sopenharmony_ci * Locking orders
2962306a36Sopenharmony_ci *
3062306a36Sopenharmony_ci * xfs_buf_ioacct_inc:
3162306a36Sopenharmony_ci * xfs_buf_ioacct_dec:
3262306a36Sopenharmony_ci *	b_sema (caller holds)
3362306a36Sopenharmony_ci *	  b_lock
3462306a36Sopenharmony_ci *
3562306a36Sopenharmony_ci * xfs_buf_stale:
3662306a36Sopenharmony_ci *	b_sema (caller holds)
3762306a36Sopenharmony_ci *	  b_lock
3862306a36Sopenharmony_ci *	    lru_lock
3962306a36Sopenharmony_ci *
4062306a36Sopenharmony_ci * xfs_buf_rele:
4162306a36Sopenharmony_ci *	b_lock
4262306a36Sopenharmony_ci *	  pag_buf_lock
4362306a36Sopenharmony_ci *	    lru_lock
4462306a36Sopenharmony_ci *
4562306a36Sopenharmony_ci * xfs_buftarg_drain_rele
4662306a36Sopenharmony_ci *	lru_lock
4762306a36Sopenharmony_ci *	  b_lock (trylock due to inversion)
4862306a36Sopenharmony_ci *
4962306a36Sopenharmony_ci * xfs_buftarg_isolate
5062306a36Sopenharmony_ci *	lru_lock
5162306a36Sopenharmony_ci *	  b_lock (trylock due to inversion)
5262306a36Sopenharmony_ci */
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_cistatic int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_cistatic inline int
5762306a36Sopenharmony_cixfs_buf_submit(
5862306a36Sopenharmony_ci	struct xfs_buf		*bp)
5962306a36Sopenharmony_ci{
6062306a36Sopenharmony_ci	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
6162306a36Sopenharmony_ci}
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_cistatic inline int
6462306a36Sopenharmony_cixfs_buf_is_vmapped(
6562306a36Sopenharmony_ci	struct xfs_buf	*bp)
6662306a36Sopenharmony_ci{
6762306a36Sopenharmony_ci	/*
6862306a36Sopenharmony_ci	 * Return true if the buffer is vmapped.
6962306a36Sopenharmony_ci	 *
7062306a36Sopenharmony_ci	 * b_addr is null if the buffer is not mapped, but the code is clever
7162306a36Sopenharmony_ci	 * enough to know it doesn't have to map a single page, so the check has
7262306a36Sopenharmony_ci	 * to be both for b_addr and bp->b_page_count > 1.
7362306a36Sopenharmony_ci	 */
7462306a36Sopenharmony_ci	return bp->b_addr && bp->b_page_count > 1;
7562306a36Sopenharmony_ci}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_cistatic inline int
7862306a36Sopenharmony_cixfs_buf_vmap_len(
7962306a36Sopenharmony_ci	struct xfs_buf	*bp)
8062306a36Sopenharmony_ci{
8162306a36Sopenharmony_ci	return (bp->b_page_count * PAGE_SIZE);
8262306a36Sopenharmony_ci}
8362306a36Sopenharmony_ci
/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization
 * purposes with unmount (see xfs_buftarg_drain()), so all we really need is
 * a count of in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	/* Opted-out buffers (never released) are not accounted at all. */
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	/*
	 * b_lock serialises the XFS_BSTATE_IN_FLIGHT transition so the
	 * per-cpu counter is bumped at most once per hold cycle.
	 */
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}
11262306a36Sopenharmony_ci
/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg. Caller must hold bp->b_lock; the
 * IN_FLIGHT flag guards against double-decrementing the buftarg counter.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}
12862306a36Sopenharmony_ci
/*
 * Locked wrapper around __xfs_buf_ioacct_dec() for callers that do not
 * already hold bp->b_lock (see the lock ordering notes at the top of the
 * file).
 */
static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}
13762306a36Sopenharmony_ci
/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 *
 * Locking: caller holds b_sema; b_lock nests inside it, per the ordering
 * notes at the top of the file.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	/* Zero b_lru_ref so the buffer is reclaimed as soon as b_hold drops. */
	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);	/* drop the LRU's reference */

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_cistatic int
18062306a36Sopenharmony_cixfs_buf_get_maps(
18162306a36Sopenharmony_ci	struct xfs_buf		*bp,
18262306a36Sopenharmony_ci	int			map_count)
18362306a36Sopenharmony_ci{
18462306a36Sopenharmony_ci	ASSERT(bp->b_maps == NULL);
18562306a36Sopenharmony_ci	bp->b_map_count = map_count;
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci	if (map_count == 1) {
18862306a36Sopenharmony_ci		bp->b_maps = &bp->__b_map;
18962306a36Sopenharmony_ci		return 0;
19062306a36Sopenharmony_ci	}
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
19362306a36Sopenharmony_ci				KM_NOFS);
19462306a36Sopenharmony_ci	if (!bp->b_maps)
19562306a36Sopenharmony_ci		return -ENOMEM;
19662306a36Sopenharmony_ci	return 0;
19762306a36Sopenharmony_ci}
19862306a36Sopenharmony_ci
/*
 *	Frees b_maps if it was allocated separately (i.e. the buffer used
 *	more than one map and did not use the embedded __b_map).
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}
21162306a36Sopenharmony_ci
/*
 * Allocate and initialise an xfs_buf for the given buftarg covering the
 * supplied map vector. No data pages are allocated here; on success the
 * buffer is returned with a single hold reference and b_sema held (locked,
 * no waiters). Returns 0 and sets *bpp, or a negative errno if the map
 * vector allocation fails.
 */
static int
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	*bpp = NULL;
	/* __GFP_NOFAIL: this allocation sleeps until it succeeds. */
	bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);	/* caller's reference */
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	bp->b_target = target;
	bp->b_mount = target->bt_mount;
	bp->b_flags = flags;

	/*
	 * Set up the map vector (embedded map for nmaps == 1, allocated
	 * otherwise) before copying the (block, length) pairs below.
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error)  {
		kmem_cache_free(xfs_buf_cache, bp);
		return error;
	}

	/* Hash key is the first block; b_length accumulates all maps. */
	bp->b_rhash_key = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(bp->b_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	*bpp = bp;
	return 0;
}
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_cistatic void
27562306a36Sopenharmony_cixfs_buf_free_pages(
27662306a36Sopenharmony_ci	struct xfs_buf	*bp)
27762306a36Sopenharmony_ci{
27862306a36Sopenharmony_ci	uint		i;
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	ASSERT(bp->b_flags & _XBF_PAGES);
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci	if (xfs_buf_is_vmapped(bp))
28362306a36Sopenharmony_ci		vm_unmap_ram(bp->b_addr, bp->b_page_count);
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci	for (i = 0; i < bp->b_page_count; i++) {
28662306a36Sopenharmony_ci		if (bp->b_pages[i])
28762306a36Sopenharmony_ci			__free_page(bp->b_pages[i]);
28862306a36Sopenharmony_ci	}
28962306a36Sopenharmony_ci	mm_account_reclaimed_pages(bp->b_page_count);
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci	if (bp->b_pages != bp->b_page_array)
29262306a36Sopenharmony_ci		kmem_free(bp->b_pages);
29362306a36Sopenharmony_ci	bp->b_pages = NULL;
29462306a36Sopenharmony_ci	bp->b_flags &= ~_XBF_PAGES;
29562306a36Sopenharmony_ci}
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_cistatic void
29862306a36Sopenharmony_cixfs_buf_free_callback(
29962306a36Sopenharmony_ci	struct callback_head	*cb)
30062306a36Sopenharmony_ci{
30162306a36Sopenharmony_ci	struct xfs_buf		*bp = container_of(cb, struct xfs_buf, b_rcu);
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	xfs_buf_free_maps(bp);
30462306a36Sopenharmony_ci	kmem_cache_free(xfs_buf_cache, bp);
30562306a36Sopenharmony_ci}
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_cistatic void
30862306a36Sopenharmony_cixfs_buf_free(
30962306a36Sopenharmony_ci	struct xfs_buf		*bp)
31062306a36Sopenharmony_ci{
31162306a36Sopenharmony_ci	trace_xfs_buf_free(bp, _RET_IP_);
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci	ASSERT(list_empty(&bp->b_lru));
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	if (bp->b_flags & _XBF_PAGES)
31662306a36Sopenharmony_ci		xfs_buf_free_pages(bp);
31762306a36Sopenharmony_ci	else if (bp->b_flags & _XBF_KMEM)
31862306a36Sopenharmony_ci		kmem_free(bp->b_addr);
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
32162306a36Sopenharmony_ci}
32262306a36Sopenharmony_ci
/*
 * Try to back a small buffer with heap memory instead of whole pages.
 * Returns -ENOMEM if the allocation fails or if the resulting region
 * straddles a page boundary; the caller then falls back to the page
 * allocator (see xfs_buf_find_insert()).
 */
static int
xfs_buf_alloc_kmem(
	struct xfs_buf	*bp,
	xfs_buf_flags_t	flags)
{
	xfs_km_flags_t	kmflag_mask = KM_NOFS;
	size_t		size = BBTOB(bp->b_length);

	/* Assure zeroed buffer for non-read cases. */
	if (!(flags & XBF_READ))
		kmflag_mask |= KM_ZERO;

	bp->b_addr = kmem_alloc(size, kmflag_mask);
	if (!bp->b_addr)
		return -ENOMEM;

	/*
	 * The heap object must fit inside one page: below, the buffer is
	 * described as a single (page, offset) pair.
	 */
	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
		/* b_addr spans two pages - use alloc_page instead */
		kmem_free(bp->b_addr);
		bp->b_addr = NULL;
		return -ENOMEM;
	}
	bp->b_offset = offset_in_page(bp->b_addr);
	bp->b_pages = bp->b_page_array;
	bp->b_pages[0] = kmem_to_page(bp->b_addr);
	bp->b_page_count = 1;
	bp->b_flags |= _XBF_KMEM;
	return 0;
}
35362306a36Sopenharmony_ci
/*
 * Back a buffer with individually allocated pages. Read-ahead allocations
 * are opportunistic (__GFP_NORETRY, fail instead of retrying); everything
 * else retries until the whole page array is populated. Returns 0 on
 * success or -ENOMEM.
 */
static int
xfs_buf_alloc_pages(
	struct xfs_buf	*bp,
	xfs_buf_flags_t	flags)
{
	gfp_t		gfp_mask = __GFP_NOWARN;
	long		filled = 0;

	if (flags & XBF_READ_AHEAD)
		gfp_mask |= __GFP_NORETRY;
	else
		gfp_mask |= GFP_NOFS;

	/* Make sure that we have a page list */
	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
	if (bp->b_page_count <= XB_PAGES) {
		/* small buffers use the inline array */
		bp->b_pages = bp->b_page_array;
	} else {
		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
					gfp_mask);
		if (!bp->b_pages)
			return -ENOMEM;
	}
	bp->b_flags |= _XBF_PAGES;

	/* Assure zeroed buffer for non-read cases. */
	if (!(flags & XBF_READ))
		gfp_mask |= __GFP_ZERO;

	/*
	 * Bulk filling of pages can take multiple calls. Not filling the entire
	 * array is not an allocation failure, so don't back off if we get at
	 * least one extra page.
	 */
	for (;;) {
		long	last = filled;

		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
						bp->b_pages);
		if (filled == bp->b_page_count) {
			XFS_STATS_INC(bp->b_mount, xb_page_found);
			break;
		}

		/* Made progress this pass - try again immediately. */
		if (filled != last)
			continue;

		/* No progress: read-ahead gives up rather than retrying. */
		if (flags & XBF_READ_AHEAD) {
			xfs_buf_free_pages(bp);
			return -ENOMEM;
		}

		XFS_STATS_INC(bp->b_mount, xb_page_retries);
		memalloc_retry_wait(gfp_mask);
	}
	return 0;
}
41162306a36Sopenharmony_ci
/*
 *	Map buffer into kernel address-space if necessary. Single-page
 *	buffers are mapped directly; multi-page buffers are vmapped unless
 *	the caller asked for XBF_UNMAPPED. Returns 0 or -ENOMEM if the
 *	vmap could not be established.
 */
STATIC int
_xfs_buf_map_pages(
	struct xfs_buf		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]);
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1);
			if (bp->b_addr)
				break;
			/* Flush stale vmap aliases to free up address space,
			 * then retry the mapping (at most one extra time). */
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
	}

	return 0;
}
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci/*
45662306a36Sopenharmony_ci *	Finding and Reading Buffers
45762306a36Sopenharmony_ci */
/*
 * rhashtable compare callback for the per-AG buffer cache. Returns 0 on a
 * match (same block number AND same length), non-zero otherwise.
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_rhash_key != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		if (!(map->bm_flags & XBM_LIVESCAN))
			ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}
49062306a36Sopenharmony_ci
/* Parameters for the per-AG buffer cache hash table. */
static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),	/* hash on bm_bn only */
	.key_offset		= offsetof(struct xfs_buf, b_rhash_key),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,	/* exact match needs length too */
};
50062306a36Sopenharmony_ci
/*
 * Initialise the per-AG buffer cache: the lock protecting hash inserts and
 * removals, and the hash table itself. Returns the rhashtable_init() result.
 */
int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}
50862306a36Sopenharmony_ci
/* Tear down the per-AG buffer cache hash table. */
void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}
51562306a36Sopenharmony_ci
51662306a36Sopenharmony_cistatic int
51762306a36Sopenharmony_cixfs_buf_map_verify(
51862306a36Sopenharmony_ci	struct xfs_buftarg	*btp,
51962306a36Sopenharmony_ci	struct xfs_buf_map	*map)
52062306a36Sopenharmony_ci{
52162306a36Sopenharmony_ci	xfs_daddr_t		eofs;
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	/* Check for IOs smaller than the sector size / not sector aligned */
52462306a36Sopenharmony_ci	ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
52562306a36Sopenharmony_ci	ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci	/*
52862306a36Sopenharmony_ci	 * Corrupted block numbers can get through to here, unfortunately, so we
52962306a36Sopenharmony_ci	 * have to check that the buffer falls within the filesystem bounds.
53062306a36Sopenharmony_ci	 */
53162306a36Sopenharmony_ci	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
53262306a36Sopenharmony_ci	if (map->bm_bn < 0 || map->bm_bn >= eofs) {
53362306a36Sopenharmony_ci		xfs_alert(btp->bt_mount,
53462306a36Sopenharmony_ci			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
53562306a36Sopenharmony_ci			  __func__, map->bm_bn, eofs);
53662306a36Sopenharmony_ci		WARN_ON(1);
53762306a36Sopenharmony_ci		return -EFSCORRUPTED;
53862306a36Sopenharmony_ci	}
53962306a36Sopenharmony_ci	return 0;
54062306a36Sopenharmony_ci}
54162306a36Sopenharmony_ci
/*
 * Lock a buffer found in the cache. Returns -EAGAIN if XBF_TRYLOCK was set
 * and the lock is contended, -ENOENT if a live scan hits a stale buffer,
 * and 0 with the buffer locked otherwise.
 */
static int
xfs_buf_find_lock(
	struct xfs_buf          *bp,
	xfs_buf_flags_t		flags)
{
	if (flags & XBF_TRYLOCK) {
		if (!xfs_buf_trylock(bp)) {
			XFS_STATS_INC(bp->b_mount, xb_busy_locked);
			return -EAGAIN;
		}
	} else {
		xfs_buf_lock(bp);
		XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		if (flags & XBF_LIVESCAN) {
			xfs_buf_unlock(bp);
			return -ENOENT;
		}
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		/* keep only the memory-allocation flags */
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}
	return 0;
}
57362306a36Sopenharmony_ci
/*
 * Lockless cache lookup. Under RCU we find the buffer in the per-AG hash
 * and take a hold reference; atomic_inc_not_zero() guards against racing
 * with a buffer whose last reference is being dropped. Returns -ENOENT on
 * a miss, or the result of locking the buffer (see xfs_buf_find_lock()).
 */
static inline int
xfs_buf_lookup(
	struct xfs_perag	*pag,
	struct xfs_buf_map	*map,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf          *bp;
	int			error;

	rcu_read_lock();
	bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
		rcu_read_unlock();
		return -ENOENT;
	}
	rcu_read_unlock();

	error = xfs_buf_find_lock(bp, flags);
	if (error) {
		/* drop the hold we took above */
		xfs_buf_rele(bp);
		return error;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;
}
60262306a36Sopenharmony_ci
/*
 * Insert the new_bp into the hash table. This consumes the perag reference
 * taken for the lookup regardless of the result of the insert: on success
 * the new buffer keeps it (via b_pag); on any failure, or when an existing
 * buffer wins the insert race, it is dropped here.
 */
static int
xfs_buf_find_insert(
	struct xfs_buftarg	*btp,
	struct xfs_perag	*pag,
	struct xfs_buf_map	*cmap,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*new_bp;
	struct xfs_buf		*bp;
	int			error;

	error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
	if (error)
		goto out_drop_pag;

	/*
	 * For buffers that fit entirely within a single page, first attempt to
	 * allocate the memory from the heap to minimise memory usage. If we
	 * can't get heap memory for these small buffers, we fall back to using
	 * the page allocator.
	 */
	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
		error = xfs_buf_alloc_pages(new_bp, flags);
		if (error)
			goto out_free_buf;
	}

	/* pag_buf_lock serialises the insert against concurrent lookups. */
	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
			&new_bp->b_rhash_head, xfs_buf_hash_params);
	if (IS_ERR(bp)) {
		error = PTR_ERR(bp);
		spin_unlock(&pag->pag_buf_lock);
		goto out_free_buf;
	}
	if (bp) {
		/* found an existing buffer */
		atomic_inc(&bp->b_hold);
		spin_unlock(&pag->pag_buf_lock);
		error = xfs_buf_find_lock(bp, flags);
		if (error)
			xfs_buf_rele(bp);
		else
			*bpp = bp;
		/* free our unused buffer and drop the perag reference */
		goto out_free_buf;
	}

	/* The new buffer keeps the perag reference until it is freed. */
	new_bp->b_pag = pag;
	spin_unlock(&pag->pag_buf_lock);
	*bpp = new_bp;
	return 0;

out_free_buf:
	xfs_buf_free(new_bp);
out_drop_pag:
	xfs_perag_put(pag);
	return error;
}
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci/*
67262306a36Sopenharmony_ci * Assembles a buffer covering the specified range. The code is optimised for
67362306a36Sopenharmony_ci * cache hits, as metadata intensive workloads will see 3 orders of magnitude
67462306a36Sopenharmony_ci * more hits than misses.
67562306a36Sopenharmony_ci */
int
xfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_perag	*pag;
	struct xfs_buf		*bp = NULL;
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	int			error;
	int			i;

	/* Propagate livescan lookups into the composed cache key. */
	if (flags & XBF_LIVESCAN)
		cmap.bm_flags |= XBM_LIVESCAN;
	/* The composed key covers the total length of all the maps. */
	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	/* Bail out early if the composed map fails verification. */
	error = xfs_buf_map_verify(btp, &cmap);
	if (error)
		return error;

	/* Take a perag reference for the cache lookup/insert below. */
	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	/* -ENOENT (cache miss) is handled below; other errors are fatal. */
	error = xfs_buf_lookup(pag, &cmap, flags, &bp);
	if (error && error != -ENOENT)
		goto out_put_perag;

	/* cache hits always outnumber misses by at least 10:1 */
	if (unlikely(!bp)) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);

		/* XBF_INCORE callers do not want a new buffer on a miss. */
		if (flags & XBF_INCORE)
			goto out_put_perag;

		/* xfs_buf_find_insert() consumes the perag reference. */
		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
				flags, &bp);
		if (error)
			return error;
	} else {
		XFS_STATS_INC(btp->bt_mount, xb_get_locked);
		xfs_perag_put(pag);
	}

	/* We do not hold a perag reference anymore. */
	if (!bp->b_addr) {
		/* Ensure the buffer has a usable mapping (b_addr). */
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn_ratelimited(btp->bt_mount,
				"%s: failed to map %u pages", __func__,
				bp->b_page_count);
			xfs_buf_relse(bp);
			return error;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(btp->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;

out_put_perag:
	xfs_perag_put(pag);
	return error;
}
75162306a36Sopenharmony_ci
75262306a36Sopenharmony_ciint
75362306a36Sopenharmony_ci_xfs_buf_read(
75462306a36Sopenharmony_ci	struct xfs_buf		*bp,
75562306a36Sopenharmony_ci	xfs_buf_flags_t		flags)
75662306a36Sopenharmony_ci{
75762306a36Sopenharmony_ci	ASSERT(!(flags & XBF_WRITE));
75862306a36Sopenharmony_ci	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
75962306a36Sopenharmony_ci
76062306a36Sopenharmony_ci	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
76162306a36Sopenharmony_ci	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_ci	return xfs_buf_submit(bp);
76462306a36Sopenharmony_ci}
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci/*
76762306a36Sopenharmony_ci * Reverify a buffer found in cache without an attached ->b_ops.
76862306a36Sopenharmony_ci *
76962306a36Sopenharmony_ci * If the caller passed an ops structure and the buffer doesn't have ops
77062306a36Sopenharmony_ci * assigned, set the ops and use it to verify the contents. If verification
77162306a36Sopenharmony_ci * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
77262306a36Sopenharmony_ci * already in XBF_DONE state on entry.
77362306a36Sopenharmony_ci *
77462306a36Sopenharmony_ci * Under normal operations, every in-core buffer is verified on read I/O
77562306a36Sopenharmony_ci * completion. There are two scenarios that can lead to in-core buffers without
77662306a36Sopenharmony_ci * an assigned ->b_ops. The first is during log recovery of buffers on a V4
77762306a36Sopenharmony_ci * filesystem, though these buffers are purged at the end of recovery. The
77862306a36Sopenharmony_ci * other is online repair, which intentionally reads with a NULL buffer ops to
77962306a36Sopenharmony_ci * run several verifiers across an in-core buffer in order to establish buffer
78062306a36Sopenharmony_ci * type.  If repair can't establish that, the buffer will be left in memory
78162306a36Sopenharmony_ci * with NULL buffer ops.
78262306a36Sopenharmony_ci */
78362306a36Sopenharmony_ciint
78462306a36Sopenharmony_cixfs_buf_reverify(
78562306a36Sopenharmony_ci	struct xfs_buf		*bp,
78662306a36Sopenharmony_ci	const struct xfs_buf_ops *ops)
78762306a36Sopenharmony_ci{
78862306a36Sopenharmony_ci	ASSERT(bp->b_flags & XBF_DONE);
78962306a36Sopenharmony_ci	ASSERT(bp->b_error == 0);
79062306a36Sopenharmony_ci
79162306a36Sopenharmony_ci	if (!ops || bp->b_ops)
79262306a36Sopenharmony_ci		return 0;
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci	bp->b_ops = ops;
79562306a36Sopenharmony_ci	bp->b_ops->verify_read(bp);
79662306a36Sopenharmony_ci	if (bp->b_error)
79762306a36Sopenharmony_ci		bp->b_flags &= ~XBF_DONE;
79862306a36Sopenharmony_ci	return bp->b_error;
79962306a36Sopenharmony_ci}
80062306a36Sopenharmony_ci
/*
 * Look up (or allocate) the buffer covering @map, lock it, and make sure its
 * contents are valid: if the buffer is not XBF_DONE, issue a read and verify
 * with @ops; otherwise attach @ops and re-verify in place.  On success *@bpp
 * holds a locked, referenced buffer.  XBF_ASYNC (readahead) callers never get
 * the buffer back — the async completion path owns its release.
 */
int
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops,
	xfs_failaddr_t		fa)
{
	struct xfs_buf		*bp;
	int			error;

	flags |= XBF_READ;
	*bpp = NULL;

	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
	if (error)
		return error;

	trace_xfs_buf_read(bp, flags, _RET_IP_);

	if (!(bp->b_flags & XBF_DONE)) {
		/* Initiate the buffer read and wait. */
		XFS_STATS_INC(target->bt_mount, xb_get_read);
		bp->b_ops = ops;
		error = _xfs_buf_read(bp, flags);

		/* Readahead iodone already dropped the buffer, so exit. */
		if (flags & XBF_ASYNC)
			return 0;
	} else {
		/* Buffer already read; all we need to do is check it. */
		error = xfs_buf_reverify(bp, ops);

		/* Readahead already finished; drop the buffer and exit. */
		if (flags & XBF_ASYNC) {
			xfs_buf_relse(bp);
			return 0;
		}

		/* We do not want read in the flags */
		bp->b_flags &= ~XBF_READ;
		ASSERT(bp->b_ops != NULL || ops == NULL);
	}

	/*
	 * If we've had a read error, then the contents of the buffer are
	 * invalid and should not be used. To ensure that a followup read tries
	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
	 * mark the buffer stale. This ensures that anyone who has a current
	 * reference to the buffer will interpret it's contents correctly and
	 * future cache lookups will also treat it as an empty, uninitialised
	 * buffer.
	 */
	if (error) {
		/*
		 * Check against log shutdown for error reporting because
		 * metadata writeback may require a read first and we need to
		 * report errors in metadata writeback until the log is shut
		 * down. High level transaction read functions already check
		 * against mount shutdown, anyway, so we only need to be
		 * concerned about low level IO interactions here.
		 */
		if (!xlog_is_shutdown(target->bt_mount->m_log))
			xfs_buf_ioerror_alert(bp, fa);

		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_relse(bp);

		/* bad CRC means corrupted metadata */
		if (error == -EFSBADCRC)
			error = -EFSCORRUPTED;
		return error;
	}

	*bpp = bp;
	return 0;
}
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci/*
88362306a36Sopenharmony_ci *	If we are not low on memory then do the readahead in a deadlock
88462306a36Sopenharmony_ci *	safe manner.
88562306a36Sopenharmony_ci */
88662306a36Sopenharmony_civoid
88762306a36Sopenharmony_cixfs_buf_readahead_map(
88862306a36Sopenharmony_ci	struct xfs_buftarg	*target,
88962306a36Sopenharmony_ci	struct xfs_buf_map	*map,
89062306a36Sopenharmony_ci	int			nmaps,
89162306a36Sopenharmony_ci	const struct xfs_buf_ops *ops)
89262306a36Sopenharmony_ci{
89362306a36Sopenharmony_ci	struct xfs_buf		*bp;
89462306a36Sopenharmony_ci
89562306a36Sopenharmony_ci	xfs_buf_read_map(target, map, nmaps,
89662306a36Sopenharmony_ci		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
89762306a36Sopenharmony_ci		     __this_address);
89862306a36Sopenharmony_ci}
89962306a36Sopenharmony_ci
90062306a36Sopenharmony_ci/*
90162306a36Sopenharmony_ci * Read an uncached buffer from disk. Allocates and returns a locked
90262306a36Sopenharmony_ci * buffer containing the disk contents or nothing. Uncached buffers always have
90362306a36Sopenharmony_ci * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
90462306a36Sopenharmony_ci * is cached or uncached during fault diagnosis.
90562306a36Sopenharmony_ci */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	*bpp = NULL;

	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
	if (error)
		return error;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_rhash_key = XFS_BUF_DADDR_NULL;	/* uncached marker, see above */
	bp->b_maps[0].bm_bn = daddr;		/* real disk address to read */
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	/*
	 * NOTE: b_error is inspected immediately after submission, which
	 * relies on the non-XBF_ASYNC submit path completing before return.
	 */
	xfs_buf_submit(bp);
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}
94162306a36Sopenharmony_ci
/*
 * Allocate an uncached buffer of @numblks blocks.  The buffer uses the null
 * daddr cache key (see DEFINE_SINGLE_BUF_MAP below) and is returned with its
 * pages allocated and mapped, ready for the caller to set up I/O.
 */
int
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	int			error;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	*bpp = NULL;

	/* flags might contain irrelevant bits, pass only what we care about */
	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
	if (error)
		return error;

	/* Uncached buffers always get freshly allocated pages. */
	error = xfs_buf_alloc_pages(bp, flags);
	if (error)
		goto fail_free_buf;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_buf;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	*bpp = bp;
	return 0;

fail_free_buf:
	/* Common error exit once the buffer itself has been allocated. */
	xfs_buf_free(bp);
	return error;
}
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_ci/*
98162306a36Sopenharmony_ci *	Increment reference count on buffer, to hold the buffer concurrently
98262306a36Sopenharmony_ci *	with another thread which may release (free) the buffer asynchronously.
98362306a36Sopenharmony_ci *	Must hold the buffer already to call this function.
98462306a36Sopenharmony_ci */
void
xfs_buf_hold(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	/* Caller already holds a reference, so b_hold cannot be zero here. */
	atomic_inc(&bp->b_hold);
}
99262306a36Sopenharmony_ci
99362306a36Sopenharmony_ci/*
99462306a36Sopenharmony_ci * Release a hold on the specified buffer. If the hold count is 1, the buffer is
99562306a36Sopenharmony_ci * placed on LRU or freed (depending on b_lru_ref).
99662306a36Sopenharmony_ci */
void
xfs_buf_rele(
	struct xfs_buf		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		/*
		 * Uncached buffers carry no perag and are never on an LRU;
		 * the last reference frees them immediately.
		 */
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
		 * ensures the decrement occurs only once per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		/*
		 * Remove the buffer from the per-AG cache and drop the perag
		 * reference it has held since insertion.
		 */
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* Defer the actual free until after b_lock has been dropped. */
	if (freebuf)
		xfs_buf_free(bp);
}
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci
/*
 *	Lock a buffer object, if it is not already locked.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than having lock attempts block until
 *	someone else pushes the log, the blocking lock path (xfs_buf_lock)
 *	pushes it first, so that the xfsaild does not get stuck trying to
 *	push on stale inode buffers.
 *	NOTE(review): xfs_buf_trylock() itself does not force the log; the
 *	push described above happens in xfs_buf_lock().
 */
109562306a36Sopenharmony_ciint
109662306a36Sopenharmony_cixfs_buf_trylock(
109762306a36Sopenharmony_ci	struct xfs_buf		*bp)
109862306a36Sopenharmony_ci{
109962306a36Sopenharmony_ci	int			locked;
110062306a36Sopenharmony_ci
110162306a36Sopenharmony_ci	locked = down_trylock(&bp->b_sema) == 0;
110262306a36Sopenharmony_ci	if (locked)
110362306a36Sopenharmony_ci		trace_xfs_buf_trylock(bp, _RET_IP_);
110462306a36Sopenharmony_ci	else
110562306a36Sopenharmony_ci		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
110662306a36Sopenharmony_ci	return locked;
110762306a36Sopenharmony_ci}
110862306a36Sopenharmony_ci
110962306a36Sopenharmony_ci/*
111062306a36Sopenharmony_ci *	Lock a buffer object.
111162306a36Sopenharmony_ci *
111262306a36Sopenharmony_ci *	If we come across a stale, pinned, locked buffer, we know that we
111362306a36Sopenharmony_ci *	are being asked to lock a buffer that has been reallocated. Because
111462306a36Sopenharmony_ci *	it is pinned, we know that the log has not been pushed to disk and
111562306a36Sopenharmony_ci *	hence it will still be locked. Rather than sleeping until someone
111662306a36Sopenharmony_ci *	else pushes the log, push it ourselves before trying to get the lock.
111762306a36Sopenharmony_ci */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	/*
	 * Per the comment above: a pinned, stale buffer stays locked until
	 * the log is pushed, so force the log before sleeping on b_sema.
	 */
	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_mount, 0);
	down(&bp->b_sema);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}
113062306a36Sopenharmony_ci
/* Drop the buffer lock (b_sema).  Only the lock holder may call this. */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	up(&bp->b_sema);
	trace_xfs_buf_unlock(bp, _RET_IP_);
}
114062306a36Sopenharmony_ci
/*
 * Sleep until the buffer's pin count drops to zero.  Open-coded waitqueue
 * loop: the task state is set before b_pin_count is rechecked so that a
 * wakeup arriving in between is not lost.
 */
STATIC void
xfs_buf_wait_unpin(
	struct xfs_buf		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	/* Fast path: not pinned, no need to queue at all. */
	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}
116062306a36Sopenharmony_ci
/*
 * Emit a buffer I/O error alert, limited to one message per target every
 * five seconds.  The static rate-limit state is updated without locking;
 * NOTE(review): concurrent callers can race and emit extra alerts — this
 * looks like an accepted trade-off, confirm if alert volume matters.
 */
static void
xfs_buf_ioerror_alert_ratelimited(
	struct xfs_buf		*bp)
{
	static unsigned long	lasttime;
	static struct xfs_buftarg *lasttarg;

	if (bp->b_target != lasttarg ||
	    time_after(jiffies, (lasttime + 5*HZ))) {
		lasttime = jiffies;
		xfs_buf_ioerror_alert(bp, __this_address);
	}
	lasttarg = bp->b_target;
}
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_ci/*
117762306a36Sopenharmony_ci * Account for this latest trip around the retry handler, and decide if
117862306a36Sopenharmony_ci * we've failed enough times to constitute a permanent failure.
117962306a36Sopenharmony_ci */
static bool
xfs_buf_ioerror_permanent(
	struct xfs_buf		*bp,
	struct xfs_error_cfg	*cfg)
{
	struct xfs_mount	*mp = bp->b_mount;

	/*
	 * Retry budget exhausted?  The pre-increment counts this trip, so
	 * b_retries only advances when a finite budget is configured.
	 */
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    ++bp->b_retries > cfg->max_retries)
		return true;
	/* Retry window expired since the first recorded failure? */
	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
		return true;

	/* At unmount we may treat errors differently */
	if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
		return true;

	return false;
}
120062306a36Sopenharmony_ci
120162306a36Sopenharmony_ci/*
120262306a36Sopenharmony_ci * On a sync write or shutdown we just want to stale the buffer and let the
120362306a36Sopenharmony_ci * caller handle the error in bp->b_error appropriately.
120462306a36Sopenharmony_ci *
120562306a36Sopenharmony_ci * If the write was asynchronous then no one will be looking for the error.  If
120662306a36Sopenharmony_ci * this is the first failure of this type, clear the error state and write the
120762306a36Sopenharmony_ci * buffer out again. This means we always retry an async write failure at least
120862306a36Sopenharmony_ci * once, but we also need to set the buffer up to behave correctly now for
120962306a36Sopenharmony_ci * repeated failures.
121062306a36Sopenharmony_ci *
121162306a36Sopenharmony_ci * If we get repeated async write failures, then we take action according to the
121262306a36Sopenharmony_ci * error configuration we have been set up to use.
121362306a36Sopenharmony_ci *
121462306a36Sopenharmony_ci * Returns true if this function took care of error handling and the caller must
121562306a36Sopenharmony_ci * not touch the buffer again.  Return false if the caller should proceed with
121662306a36Sopenharmony_ci * normal I/O completion handling.
121762306a36Sopenharmony_ci */
static bool
xfs_buf_ioend_handle_error(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_error_cfg	*cfg;

	/*
	 * If we've already shutdown the journal because of I/O errors, there's
	 * no point in giving this a retry.
	 */
	if (xlog_is_shutdown(mp->m_log))
		goto out_stale;

	xfs_buf_ioerror_alert_ratelimited(bp);

	/*
	 * We're not going to bother about retrying this during recovery.
	 * One strike!
	 */
	if (bp->b_flags & _XBF_LOGRECOVERY) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		return false;
	}

	/*
	 * Synchronous writes will have callers process the error.
	 */
	if (!(bp->b_flags & XBF_ASYNC))
		goto out_stale;

	trace_xfs_buf_iodone_async(bp, _RET_IP_);

	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
	if (bp->b_last_error != bp->b_error ||
	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
		/* First failure of this error type: always retry once. */
		bp->b_last_error = bp->b_error;
		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
		    !bp->b_first_retry_time)
			bp->b_first_retry_time = jiffies;
		goto resubmit;
	}

	/*
	 * Permanent error - we need to trigger a shutdown if we haven't already
	 * to indicate that inconsistency will result from this action.
	 */
	if (xfs_buf_ioerror_permanent(bp, cfg)) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out_stale;
	}

	/* Still considered a transient error. Caller will schedule retries. */
	if (bp->b_flags & _XBF_INODES)
		xfs_buf_inode_io_fail(bp);
	else if (bp->b_flags & _XBF_DQUOTS)
		xfs_buf_dquot_io_fail(bp);
	else
		ASSERT(list_empty(&bp->b_li_list));
	xfs_buf_ioerror(bp, 0);
	xfs_buf_relse(bp);
	return true;

resubmit:
	/* Clear the error and push the write back out ourselves. */
	xfs_buf_ioerror(bp, 0);
	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
	xfs_buf_submit(bp);
	return true;
out_stale:
	/* Stale the buffer; the caller processes bp->b_error itself. */
	xfs_buf_stale(bp);
	bp->b_flags |= XBF_DONE;
	bp->b_flags &= ~XBF_WRITE;
	trace_xfs_buf_error_relse(bp, _RET_IP_);
	return false;
}
129362306a36Sopenharmony_ci
/*
 * Complete I/O processing for a buffer.  Pulls any bio-level error into
 * b_error, runs the read verifier or the write error/retry handling as
 * appropriate, then either releases the buffer (async I/O) or wakes the
 * synchronous waiter.
 */
static void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	if (bp->b_flags & XBF_READ) {
		/* Reads: verify the on-disk contents before marking done. */
		if (!bp->b_error && bp->b_ops)
			bp->b_ops->verify_read(bp);
		if (!bp->b_error)
			bp->b_flags |= XBF_DONE;
	} else {
		if (!bp->b_error) {
			bp->b_flags &= ~XBF_WRITE_FAIL;
			bp->b_flags |= XBF_DONE;
		}

		/*
		 * Failed writes may be retried or escalated to a shutdown by
		 * the error handler; if it resubmitted the buffer, completion
		 * processing for this I/O is finished.
		 */
		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
			return;

		/* clear the retry state */
		bp->b_last_error = 0;
		bp->b_retries = 0;
		bp->b_first_retry_time = 0;

		/*
		 * Note that for things like remote attribute buffers, there may
		 * not be a buffer log item here, so processing the buffer log
		 * item must remain optional.
		 */
		if (bp->b_log_item)
			xfs_buf_item_done(bp);

		if (bp->b_flags & _XBF_INODES)
			xfs_buf_inode_iodone(bp);
		else if (bp->b_flags & _XBF_DQUOTS)
			xfs_buf_dquot_iodone(bp);

	}

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
			 _XBF_LOGRECOVERY);

	/* Async I/O owns the buffer reference; drop it now it is complete. */
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_cistatic void
135162306a36Sopenharmony_cixfs_buf_ioend_work(
135262306a36Sopenharmony_ci	struct work_struct	*work)
135362306a36Sopenharmony_ci{
135462306a36Sopenharmony_ci	struct xfs_buf		*bp =
135562306a36Sopenharmony_ci		container_of(work, struct xfs_buf, b_ioend_work);
135662306a36Sopenharmony_ci
135762306a36Sopenharmony_ci	xfs_buf_ioend(bp);
135862306a36Sopenharmony_ci}
135962306a36Sopenharmony_ci
/*
 * Defer I/O completion processing to the per-mount buffer workqueue so it
 * runs in process context rather than in the bio completion context.
 */
static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
}
136762306a36Sopenharmony_ci
/*
 * Record an error against a buffer.  @error must be zero or a negative
 * errno; @failaddr identifies the detection site for tracing.
 */
void
__xfs_buf_ioerror(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	/* sanity check: errors are negative errnos within a sane range */
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}
137862306a36Sopenharmony_ci
/*
 * Emit a rate-limited alert describing a metadata I/O error on this buffer.
 * @func identifies the reporting code location for the log message.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	xfs_failaddr_t		func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
				  func, (uint64_t)xfs_buf_daddr(bp),
				  bp->b_length, -bp->b_error);
}
138962306a36Sopenharmony_ci
/*
 * Fail a buffer with -EIO without issuing any I/O: stale it, record the
 * error and run completion processing immediately.
 *
 * To simulate an I/O failure, the buffer must be locked and held with at least
 * three references. The LRU reference is dropped by the stale call. The buf
 * item reference is dropped via ioend processing. The third reference is owned
 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
 */
void
xfs_buf_ioend_fail(
	struct xfs_buf	*bp)
{
	bp->b_flags &= ~XBF_DONE;
	xfs_buf_stale(bp);
	xfs_buf_ioerror(bp, -EIO);
	xfs_buf_ioend(bp);
}
140562306a36Sopenharmony_ci
/*
 * Write a locked buffer synchronously.  The caller must hold the buffer
 * lock.  On submission failure the filesystem is shut down, as a dirty
 * metadata buffer could not be written back.  Returns 0 or a negative
 * errno.
 */
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	/* force a synchronous write and clear any stale delwri/read state */
	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_DONE);

	error = xfs_buf_submit(bp);
	if (error)
		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
	return error;
}
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_cistatic void
142562306a36Sopenharmony_cixfs_buf_bio_end_io(
142662306a36Sopenharmony_ci	struct bio		*bio)
142762306a36Sopenharmony_ci{
142862306a36Sopenharmony_ci	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci	if (!bio->bi_status &&
143162306a36Sopenharmony_ci	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
143262306a36Sopenharmony_ci	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
143362306a36Sopenharmony_ci		bio->bi_status = BLK_STS_IOERR;
143462306a36Sopenharmony_ci
143562306a36Sopenharmony_ci	/*
143662306a36Sopenharmony_ci	 * don't overwrite existing errors - otherwise we can lose errors on
143762306a36Sopenharmony_ci	 * buffers that require multiple bios to complete.
143862306a36Sopenharmony_ci	 */
143962306a36Sopenharmony_ci	if (bio->bi_status) {
144062306a36Sopenharmony_ci		int error = blk_status_to_errno(bio->bi_status);
144162306a36Sopenharmony_ci
144262306a36Sopenharmony_ci		cmpxchg(&bp->b_io_error, 0, error);
144362306a36Sopenharmony_ci	}
144462306a36Sopenharmony_ci
144562306a36Sopenharmony_ci	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
144662306a36Sopenharmony_ci		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
144762306a36Sopenharmony_ci
144862306a36Sopenharmony_ci	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
144962306a36Sopenharmony_ci		xfs_buf_ioend_async(bp);
145062306a36Sopenharmony_ci	bio_put(bio);
145162306a36Sopenharmony_ci}
145262306a36Sopenharmony_ci
/*
 * Issue the I/O for one contiguous map (vector) of the buffer, building as
 * many bios as needed to cover it.  *buf_offset and *count are advanced so
 * the caller can iterate over the remaining maps.  Each bio submitted takes
 * an extra b_io_remaining reference, dropped by the bio completion handler.
 */
static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	blk_opf_t	op)
{
	int		page_index;
	unsigned int	total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector =  bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = bio_max_segs(total_nr_pages);

	bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	/* add pages until this bio is full or the map is fully covered */
	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		/* flush CPU caches for vmapped buffers before the device sees them */
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		/* more of this map left over? build another bio for it */
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}
153062306a36Sopenharmony_ci
/*
 * Map the buffer into bios and dispatch them, plugged so the block layer
 * can merge them.  For writes, run the write verifier first and shut the
 * filesystem down instead of dispatching I/O if verification fails.
 */
STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	blk_opf_t	op;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_has_crc(mp)) {
				xfs_warn(mp,
					"%s: no buf ops on daddr 0x%llx len %d",
					__func__, xfs_buf_daddr(bp),
					bp->b_length);
				xfs_hex_dump(bp->b_addr,
						XFS_CORRUPTION_DUMP_LEN);
				dump_stack();
			}
		}
	} else {
		op = REQ_OP_READ;
		if (bp->b_flags & XBF_READ_AHEAD)
			op |= REQ_RAHEAD;
	}

	/* we only use the buffer cache for meta-data */
	op |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}
160662306a36Sopenharmony_ci
/*
 * Wait for I/O completion of a sync buffer and return the I/O error code.
 * Only valid for synchronous I/O - async completion drops the reference
 * this wait depends on.
 */
static int
xfs_buf_iowait(
	struct xfs_buf	*bp)
{
	ASSERT(!(bp->b_flags & XBF_ASYNC));

	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);

	return bp->b_error;
}
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci/*
162462306a36Sopenharmony_ci * Buffer I/O submission path, read or write. Asynchronous submission transfers
162562306a36Sopenharmony_ci * the buffer lock ownership and the current reference to the IO. It is not
162662306a36Sopenharmony_ci * safe to reference the buffer after a call to this function unless the caller
162762306a36Sopenharmony_ci * holds an additional reference itself.
162862306a36Sopenharmony_ci */
162962306a36Sopenharmony_cistatic int
163062306a36Sopenharmony_ci__xfs_buf_submit(
163162306a36Sopenharmony_ci	struct xfs_buf	*bp,
163262306a36Sopenharmony_ci	bool		wait)
163362306a36Sopenharmony_ci{
163462306a36Sopenharmony_ci	int		error = 0;
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci	trace_xfs_buf_submit(bp, _RET_IP_);
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_ci	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	/*
164162306a36Sopenharmony_ci	 * On log shutdown we stale and complete the buffer immediately. We can
164262306a36Sopenharmony_ci	 * be called to read the superblock before the log has been set up, so
164362306a36Sopenharmony_ci	 * be careful checking the log state.
164462306a36Sopenharmony_ci	 *
164562306a36Sopenharmony_ci	 * Checking the mount shutdown state here can result in the log tail
164662306a36Sopenharmony_ci	 * moving inappropriately on disk as the log may not yet be shut down.
164762306a36Sopenharmony_ci	 * i.e. failing this buffer on mount shutdown can remove it from the AIL
164862306a36Sopenharmony_ci	 * and move the tail of the log forwards without having written this
164962306a36Sopenharmony_ci	 * buffer to disk. This corrupts the log tail state in memory, and
165062306a36Sopenharmony_ci	 * because the log may not be shut down yet, it can then be propagated
165162306a36Sopenharmony_ci	 * to disk before the log is shutdown. Hence we check log shutdown
165262306a36Sopenharmony_ci	 * state here rather than mount state to avoid corrupting the log tail
165362306a36Sopenharmony_ci	 * on shutdown.
165462306a36Sopenharmony_ci	 */
165562306a36Sopenharmony_ci	if (bp->b_mount->m_log &&
165662306a36Sopenharmony_ci	    xlog_is_shutdown(bp->b_mount->m_log)) {
165762306a36Sopenharmony_ci		xfs_buf_ioend_fail(bp);
165862306a36Sopenharmony_ci		return -EIO;
165962306a36Sopenharmony_ci	}
166062306a36Sopenharmony_ci
166162306a36Sopenharmony_ci	/*
166262306a36Sopenharmony_ci	 * Grab a reference so the buffer does not go away underneath us. For
166362306a36Sopenharmony_ci	 * async buffers, I/O completion drops the callers reference, which
166462306a36Sopenharmony_ci	 * could occur before submission returns.
166562306a36Sopenharmony_ci	 */
166662306a36Sopenharmony_ci	xfs_buf_hold(bp);
166762306a36Sopenharmony_ci
166862306a36Sopenharmony_ci	if (bp->b_flags & XBF_WRITE)
166962306a36Sopenharmony_ci		xfs_buf_wait_unpin(bp);
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci	/* clear the internal error state to avoid spurious errors */
167262306a36Sopenharmony_ci	bp->b_io_error = 0;
167362306a36Sopenharmony_ci
167462306a36Sopenharmony_ci	/*
167562306a36Sopenharmony_ci	 * Set the count to 1 initially, this will stop an I/O completion
167662306a36Sopenharmony_ci	 * callout which happens before we have started all the I/O from calling
167762306a36Sopenharmony_ci	 * xfs_buf_ioend too early.
167862306a36Sopenharmony_ci	 */
167962306a36Sopenharmony_ci	atomic_set(&bp->b_io_remaining, 1);
168062306a36Sopenharmony_ci	if (bp->b_flags & XBF_ASYNC)
168162306a36Sopenharmony_ci		xfs_buf_ioacct_inc(bp);
168262306a36Sopenharmony_ci	_xfs_buf_ioapply(bp);
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci	/*
168562306a36Sopenharmony_ci	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
168662306a36Sopenharmony_ci	 * reference we took above. If we drop it to zero, run completion so
168762306a36Sopenharmony_ci	 * that we don't return to the caller with completion still pending.
168862306a36Sopenharmony_ci	 */
168962306a36Sopenharmony_ci	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
169062306a36Sopenharmony_ci		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
169162306a36Sopenharmony_ci			xfs_buf_ioend(bp);
169262306a36Sopenharmony_ci		else
169362306a36Sopenharmony_ci			xfs_buf_ioend_async(bp);
169462306a36Sopenharmony_ci	}
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ci	if (wait)
169762306a36Sopenharmony_ci		error = xfs_buf_iowait(bp);
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci	/*
170062306a36Sopenharmony_ci	 * Release the hold that keeps the buffer referenced for the entire
170162306a36Sopenharmony_ci	 * I/O. Note that if the buffer is async, it is not safe to reference
170262306a36Sopenharmony_ci	 * after this release.
170362306a36Sopenharmony_ci	 */
170462306a36Sopenharmony_ci	xfs_buf_rele(bp);
170562306a36Sopenharmony_ci	return error;
170662306a36Sopenharmony_ci}
170762306a36Sopenharmony_ci
170862306a36Sopenharmony_civoid *
170962306a36Sopenharmony_cixfs_buf_offset(
171062306a36Sopenharmony_ci	struct xfs_buf		*bp,
171162306a36Sopenharmony_ci	size_t			offset)
171262306a36Sopenharmony_ci{
171362306a36Sopenharmony_ci	struct page		*page;
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_ci	if (bp->b_addr)
171662306a36Sopenharmony_ci		return bp->b_addr + offset;
171762306a36Sopenharmony_ci
171862306a36Sopenharmony_ci	page = bp->b_pages[offset >> PAGE_SHIFT];
171962306a36Sopenharmony_ci	return page_address(page) + (offset & (PAGE_SIZE-1));
172062306a36Sopenharmony_ci}
172162306a36Sopenharmony_ci
172262306a36Sopenharmony_civoid
172362306a36Sopenharmony_cixfs_buf_zero(
172462306a36Sopenharmony_ci	struct xfs_buf		*bp,
172562306a36Sopenharmony_ci	size_t			boff,
172662306a36Sopenharmony_ci	size_t			bsize)
172762306a36Sopenharmony_ci{
172862306a36Sopenharmony_ci	size_t			bend;
172962306a36Sopenharmony_ci
173062306a36Sopenharmony_ci	bend = boff + bsize;
173162306a36Sopenharmony_ci	while (boff < bend) {
173262306a36Sopenharmony_ci		struct page	*page;
173362306a36Sopenharmony_ci		int		page_index, page_offset, csize;
173462306a36Sopenharmony_ci
173562306a36Sopenharmony_ci		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
173662306a36Sopenharmony_ci		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
173762306a36Sopenharmony_ci		page = bp->b_pages[page_index];
173862306a36Sopenharmony_ci		csize = min_t(size_t, PAGE_SIZE - page_offset,
173962306a36Sopenharmony_ci				      BBTOB(bp->b_length) - boff);
174062306a36Sopenharmony_ci
174162306a36Sopenharmony_ci		ASSERT((csize + page_offset) <= PAGE_SIZE);
174262306a36Sopenharmony_ci
174362306a36Sopenharmony_ci		memset(page_address(page) + page_offset, 0, csize);
174462306a36Sopenharmony_ci
174562306a36Sopenharmony_ci		boff += csize;
174662306a36Sopenharmony_ci	}
174762306a36Sopenharmony_ci}
174862306a36Sopenharmony_ci
/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detect from a verifier, such as incorrect inter-block relationship
 * data.  Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
 * be marked stale, but b_error will not be set.  The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	xfs_buf_corruption_error(bp, fa);
	xfs_buf_stale(bp);
}
177062306a36Sopenharmony_ci
177162306a36Sopenharmony_ci/*
177262306a36Sopenharmony_ci *	Handling of buffer targets (buftargs).
177362306a36Sopenharmony_ci */
177462306a36Sopenharmony_ci
/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 *
 * LRU walk callback: returns LRU_SKIP for buffers that must be revisited
 * on a later pass, or LRU_REMOVED after moving a buffer to the dispose
 * list supplied via @arg.
 */
static enum lru_status
xfs_buftarg_drain_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)

{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	/* lru_lock is held, so b_lock must be trylocked (lock inversion) */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}
180962306a36Sopenharmony_ci
/*
 * Wait for outstanding I/O on the buftarg to complete.
 */
void
xfs_buftarg_wait(
	struct xfs_buftarg	*btp)
{
	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);	/* poll; no wakeup mechanism for bt_io_count */
	flush_workqueue(btp->bt_mount->m_buf_workqueue);
}
183362306a36Sopenharmony_ci
/*
 * Drain and free all buffers from the buftarg's LRU, waiting for in-flight
 * I/O to finish first.  Buffers that carry a permanent write failure are
 * reported; per the assert below this is only expected after the log has
 * been shut down.
 */
void
xfs_buftarg_drain(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;
	bool			write_fail = false;

	xfs_buftarg_wait(btp);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				write_fail = true;
				xfs_buf_alert_ratelimited(bp,
					"XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
					(long long)xfs_buf_daddr(bp));
			}
			xfs_buf_rele(bp);
		}
		/* back off before retrying buffers skipped by the walk */
		if (loop++ != 0)
			delay(100);
	}

	/*
	 * If one or more failed buffers were freed, that means dirty metadata
	 * was thrown away. This should only ever happen after I/O completion
	 * handling has elevated I/O error(s) to permanent failures and shuts
	 * down the journal.
	 */
	if (write_fail) {
		ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
		xfs_alert(btp->bt_mount,
	      "Please run xfs_repair to determine the extent of the problem.");
	}
}
187862306a36Sopenharmony_ci
/*
 * LRU isolation callback used by the buftarg shrinker.  Buffers that still
 * have LRU references are rotated to get another trip through the LRU;
 * buffers whose reference count has reached zero are moved to the dispose
 * list (@arg) for the caller to free.
 */
static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}
191062306a36Sopenharmony_ci
191162306a36Sopenharmony_cistatic unsigned long
191262306a36Sopenharmony_cixfs_buftarg_shrink_scan(
191362306a36Sopenharmony_ci	struct shrinker		*shrink,
191462306a36Sopenharmony_ci	struct shrink_control	*sc)
191562306a36Sopenharmony_ci{
191662306a36Sopenharmony_ci	struct xfs_buftarg	*btp = container_of(shrink,
191762306a36Sopenharmony_ci					struct xfs_buftarg, bt_shrinker);
191862306a36Sopenharmony_ci	LIST_HEAD(dispose);
191962306a36Sopenharmony_ci	unsigned long		freed;
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
192262306a36Sopenharmony_ci				     xfs_buftarg_isolate, &dispose);
192362306a36Sopenharmony_ci
192462306a36Sopenharmony_ci	while (!list_empty(&dispose)) {
192562306a36Sopenharmony_ci		struct xfs_buf *bp;
192662306a36Sopenharmony_ci		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
192762306a36Sopenharmony_ci		list_del_init(&bp->b_lru);
192862306a36Sopenharmony_ci		xfs_buf_rele(bp);
192962306a36Sopenharmony_ci	}
193062306a36Sopenharmony_ci
193162306a36Sopenharmony_ci	return freed;
193262306a36Sopenharmony_ci}
193362306a36Sopenharmony_ci
193462306a36Sopenharmony_cistatic unsigned long
193562306a36Sopenharmony_cixfs_buftarg_shrink_count(
193662306a36Sopenharmony_ci	struct shrinker		*shrink,
193762306a36Sopenharmony_ci	struct shrink_control	*sc)
193862306a36Sopenharmony_ci{
193962306a36Sopenharmony_ci	struct xfs_buftarg	*btp = container_of(shrink,
194062306a36Sopenharmony_ci					struct xfs_buftarg, bt_shrinker);
194162306a36Sopenharmony_ci	return list_lru_shrink_count(&btp->bt_lru, sc);
194262306a36Sopenharmony_ci}
194362306a36Sopenharmony_ci
/*
 * Tear down a buffer target: unhook it from memory reclaim, release its
 * accounting structures, drop the DAX and block device references, and
 * free the structure itself.  Callers must have already drained all
 * buffers (the io_count assert enforces this in debug builds).
 */
void
xfs_free_buftarg(
	struct xfs_buftarg	*btp)
{
	struct block_device	*bdev = btp->bt_bdev;

	/* Stop the shrinker before destroying the LRU it walks. */
	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
	/* the main block device is closed by kill_block_super */
	if (bdev != btp->bt_mount->m_super->s_bdev)
		blkdev_put(bdev, btp->bt_mount->m_super);

	kmem_free(btp);
}
196262306a36Sopenharmony_ci
196362306a36Sopenharmony_ciint
196462306a36Sopenharmony_cixfs_setsize_buftarg(
196562306a36Sopenharmony_ci	xfs_buftarg_t		*btp,
196662306a36Sopenharmony_ci	unsigned int		sectorsize)
196762306a36Sopenharmony_ci{
196862306a36Sopenharmony_ci	/* Set up metadata sector size info */
196962306a36Sopenharmony_ci	btp->bt_meta_sectorsize = sectorsize;
197062306a36Sopenharmony_ci	btp->bt_meta_sectormask = sectorsize - 1;
197162306a36Sopenharmony_ci
197262306a36Sopenharmony_ci	if (set_blocksize(btp->bt_bdev, sectorsize)) {
197362306a36Sopenharmony_ci		xfs_warn(btp->bt_mount,
197462306a36Sopenharmony_ci			"Cannot set_blocksize to %u on device %pg",
197562306a36Sopenharmony_ci			sectorsize, btp->bt_bdev);
197662306a36Sopenharmony_ci		return -EINVAL;
197762306a36Sopenharmony_ci	}
197862306a36Sopenharmony_ci
197962306a36Sopenharmony_ci	/* Set up device logical sector size mask */
198062306a36Sopenharmony_ci	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
198162306a36Sopenharmony_ci	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
198262306a36Sopenharmony_ci
198362306a36Sopenharmony_ci	return 0;
198462306a36Sopenharmony_ci}
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci/*
198762306a36Sopenharmony_ci * When allocating the initial buffer target we have not yet
198862306a36Sopenharmony_ci * read in the superblock, so don't know what sized sectors
198962306a36Sopenharmony_ci * are being used at this early stage.  Play safe.
199062306a36Sopenharmony_ci */
199162306a36Sopenharmony_ciSTATIC int
199262306a36Sopenharmony_cixfs_setsize_buftarg_early(
199362306a36Sopenharmony_ci	xfs_buftarg_t		*btp,
199462306a36Sopenharmony_ci	struct block_device	*bdev)
199562306a36Sopenharmony_ci{
199662306a36Sopenharmony_ci	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
199762306a36Sopenharmony_ci}
199862306a36Sopenharmony_ci
/*
 * Allocate and initialise a buffer target for @bdev: device identifiers,
 * optional DAX device, I/O error rate limiting, sector sizes, the buffer
 * LRU, the in-flight I/O counter and the memory shrinker.
 *
 * Returns the new buffer target, or NULL if any initialisation step fails
 * (everything set up before the failing step is unwound via the goto chain).
 */
struct xfs_buftarg *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev)
{
	xfs_buftarg_t		*btp;
	const struct dax_holder_operations *ops = NULL;

#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
	ops = &xfs_dax_holder_operations;
#endif
	/*
	 * No NULL check: kmem_zalloc() without KM_MAYFAIL does not return
	 * NULL in XFS — presumably it retries until success; TODO confirm.
	 */
	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev =  bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
					    mp, ops);

	/*
	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
	 * per 30 seconds so as to not spam logs too much on repeated errors.
	 */
	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
			     DEFAULT_RATELIMIT_BURST);

	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error_free;

	if (list_lru_init(&btp->bt_lru))
		goto error_free;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error_lru;

	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
	if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
			      mp->m_super->s_id))
		goto error_pcpu;
	return btp;

error_pcpu:
	percpu_counter_destroy(&btp->bt_io_count);
error_lru:
	list_lru_destroy(&btp->bt_lru);
error_free:
	kmem_free(btp);
	return NULL;
}
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_ci/*
205362306a36Sopenharmony_ci * Cancel a delayed write list.
205462306a36Sopenharmony_ci *
205562306a36Sopenharmony_ci * Remove each buffer from the list, clear the delwri queue flag and drop the
205662306a36Sopenharmony_ci * associated buffer reference.
205762306a36Sopenharmony_ci */
205862306a36Sopenharmony_civoid
205962306a36Sopenharmony_cixfs_buf_delwri_cancel(
206062306a36Sopenharmony_ci	struct list_head	*list)
206162306a36Sopenharmony_ci{
206262306a36Sopenharmony_ci	struct xfs_buf		*bp;
206362306a36Sopenharmony_ci
206462306a36Sopenharmony_ci	while (!list_empty(list)) {
206562306a36Sopenharmony_ci		bp = list_first_entry(list, struct xfs_buf, b_list);
206662306a36Sopenharmony_ci
206762306a36Sopenharmony_ci		xfs_buf_lock(bp);
206862306a36Sopenharmony_ci		bp->b_flags &= ~_XBF_DELWRI_Q;
206962306a36Sopenharmony_ci		list_del_init(&bp->b_list);
207062306a36Sopenharmony_ci		xfs_buf_relse(bp);
207162306a36Sopenharmony_ci	}
207262306a36Sopenharmony_ci}
207362306a36Sopenharmony_ci
207462306a36Sopenharmony_ci/*
207562306a36Sopenharmony_ci * Add a buffer to the delayed write list.
207662306a36Sopenharmony_ci *
207762306a36Sopenharmony_ci * This queues a buffer for writeout if it hasn't already been.  Note that
207862306a36Sopenharmony_ci * neither this routine nor the buffer list submission functions perform
207962306a36Sopenharmony_ci * any internal synchronization.  It is expected that the lists are thread-local
208062306a36Sopenharmony_ci * to the callers.
208162306a36Sopenharmony_ci *
208262306a36Sopenharmony_ci * Returns true if we queued up the buffer, or false if it already had
208362306a36Sopenharmony_ci * been on the buffer list.
208462306a36Sopenharmony_ci */
208562306a36Sopenharmony_cibool
208662306a36Sopenharmony_cixfs_buf_delwri_queue(
208762306a36Sopenharmony_ci	struct xfs_buf		*bp,
208862306a36Sopenharmony_ci	struct list_head	*list)
208962306a36Sopenharmony_ci{
209062306a36Sopenharmony_ci	ASSERT(xfs_buf_islocked(bp));
209162306a36Sopenharmony_ci	ASSERT(!(bp->b_flags & XBF_READ));
209262306a36Sopenharmony_ci
209362306a36Sopenharmony_ci	/*
209462306a36Sopenharmony_ci	 * If the buffer is already marked delwri it already is queued up
209562306a36Sopenharmony_ci	 * by someone else for imediate writeout.  Just ignore it in that
209662306a36Sopenharmony_ci	 * case.
209762306a36Sopenharmony_ci	 */
209862306a36Sopenharmony_ci	if (bp->b_flags & _XBF_DELWRI_Q) {
209962306a36Sopenharmony_ci		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
210062306a36Sopenharmony_ci		return false;
210162306a36Sopenharmony_ci	}
210262306a36Sopenharmony_ci
210362306a36Sopenharmony_ci	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
210462306a36Sopenharmony_ci
210562306a36Sopenharmony_ci	/*
210662306a36Sopenharmony_ci	 * If a buffer gets written out synchronously or marked stale while it
210762306a36Sopenharmony_ci	 * is on a delwri list we lazily remove it. To do this, the other party
210862306a36Sopenharmony_ci	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
210962306a36Sopenharmony_ci	 * It remains referenced and on the list.  In a rare corner case it
211062306a36Sopenharmony_ci	 * might get readded to a delwri list after the synchronous writeout, in
211162306a36Sopenharmony_ci	 * which case we need just need to re-add the flag here.
211262306a36Sopenharmony_ci	 */
211362306a36Sopenharmony_ci	bp->b_flags |= _XBF_DELWRI_Q;
211462306a36Sopenharmony_ci	if (list_empty(&bp->b_list)) {
211562306a36Sopenharmony_ci		atomic_inc(&bp->b_hold);
211662306a36Sopenharmony_ci		list_add_tail(&bp->b_list, list);
211762306a36Sopenharmony_ci	}
211862306a36Sopenharmony_ci
211962306a36Sopenharmony_ci	return true;
212062306a36Sopenharmony_ci}
212162306a36Sopenharmony_ci
212262306a36Sopenharmony_ci/*
212362306a36Sopenharmony_ci * Compare function is more complex than it needs to be because
212462306a36Sopenharmony_ci * the return value is only 32 bits and we are doing comparisons
212562306a36Sopenharmony_ci * on 64 bit values
212662306a36Sopenharmony_ci */
212762306a36Sopenharmony_cistatic int
212862306a36Sopenharmony_cixfs_buf_cmp(
212962306a36Sopenharmony_ci	void			*priv,
213062306a36Sopenharmony_ci	const struct list_head	*a,
213162306a36Sopenharmony_ci	const struct list_head	*b)
213262306a36Sopenharmony_ci{
213362306a36Sopenharmony_ci	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
213462306a36Sopenharmony_ci	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
213562306a36Sopenharmony_ci	xfs_daddr_t		diff;
213662306a36Sopenharmony_ci
213762306a36Sopenharmony_ci	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
213862306a36Sopenharmony_ci	if (diff < 0)
213962306a36Sopenharmony_ci		return -1;
214062306a36Sopenharmony_ci	if (diff > 0)
214162306a36Sopenharmony_ci		return 1;
214262306a36Sopenharmony_ci	return 0;
214362306a36Sopenharmony_ci}
214462306a36Sopenharmony_ci
/*
 * Submit buffers for write. If wait_list is specified, the buffers are
 * submitted using sync I/O and placed on the wait list such that the caller can
 * iowait each buffer. Otherwise async I/O is used and the buffers are released
 * at I/O completion time. In either case, buffers remain locked until I/O
 * completes and the buffer is released from the queue.
 *
 * Returns the number of pinned buffers that were skipped (async case only;
 * the sync path blocks on the buffer lock instead of skipping).
 */
static int
xfs_buf_delwri_submit_buffers(
	struct list_head	*buffer_list,
	struct list_head	*wait_list)
{
	struct xfs_buf		*bp, *n;
	int			pinned = 0;
	struct blk_plug		plug;

	/* Sort by disk address so writes are issued in ascending order. */
	list_sort(NULL, buffer_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait_list) {
			/* Async: never block; skip locked or pinned buffers. */
			if (!xfs_buf_trylock(bp))
				continue;
			if (xfs_buf_ispinned(bp)) {
				xfs_buf_unlock(bp);
				pinned++;
				continue;
			}
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		trace_xfs_buf_delwri_split(bp, _RET_IP_);

		/*
		 * If we have a wait list, each buffer (and associated delwri
		 * queue reference) transfers to it and is submitted
		 * synchronously. Otherwise, drop the buffer from the delwri
		 * queue and submit async.
		 */
		bp->b_flags &= ~_XBF_DELWRI_Q;
		bp->b_flags |= XBF_WRITE;
		if (wait_list) {
			bp->b_flags &= ~XBF_ASYNC;
			list_move_tail(&bp->b_list, wait_list);
		} else {
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		__xfs_buf_submit(bp, false);
	}
	blk_finish_plug(&plug);

	return pinned;
}
221262306a36Sopenharmony_ci
221362306a36Sopenharmony_ci/*
221462306a36Sopenharmony_ci * Write out a buffer list asynchronously.
221562306a36Sopenharmony_ci *
221662306a36Sopenharmony_ci * This will take the @buffer_list, write all non-locked and non-pinned buffers
221762306a36Sopenharmony_ci * out and not wait for I/O completion on any of the buffers.  This interface
221862306a36Sopenharmony_ci * is only safely useable for callers that can track I/O completion by higher
221962306a36Sopenharmony_ci * level means, e.g. AIL pushing as the @buffer_list is consumed in this
222062306a36Sopenharmony_ci * function.
222162306a36Sopenharmony_ci *
222262306a36Sopenharmony_ci * Note: this function will skip buffers it would block on, and in doing so
222362306a36Sopenharmony_ci * leaves them on @buffer_list so they can be retried on a later pass. As such,
222462306a36Sopenharmony_ci * it is up to the caller to ensure that the buffer list is fully submitted or
222562306a36Sopenharmony_ci * cancelled appropriately when they are finished with the list. Failure to
222662306a36Sopenharmony_ci * cancel or resubmit the list until it is empty will result in leaked buffers
222762306a36Sopenharmony_ci * at unmount time.
222862306a36Sopenharmony_ci */
222962306a36Sopenharmony_ciint
223062306a36Sopenharmony_cixfs_buf_delwri_submit_nowait(
223162306a36Sopenharmony_ci	struct list_head	*buffer_list)
223262306a36Sopenharmony_ci{
223362306a36Sopenharmony_ci	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
223462306a36Sopenharmony_ci}
223562306a36Sopenharmony_ci
223662306a36Sopenharmony_ci/*
223762306a36Sopenharmony_ci * Write out a buffer list synchronously.
223862306a36Sopenharmony_ci *
223962306a36Sopenharmony_ci * This will take the @buffer_list, write all buffers out and wait for I/O
224062306a36Sopenharmony_ci * completion on all of the buffers. @buffer_list is consumed by the function,
224162306a36Sopenharmony_ci * so callers must have some other way of tracking buffers if they require such
224262306a36Sopenharmony_ci * functionality.
224362306a36Sopenharmony_ci */
224462306a36Sopenharmony_ciint
224562306a36Sopenharmony_cixfs_buf_delwri_submit(
224662306a36Sopenharmony_ci	struct list_head	*buffer_list)
224762306a36Sopenharmony_ci{
224862306a36Sopenharmony_ci	LIST_HEAD		(wait_list);
224962306a36Sopenharmony_ci	int			error = 0, error2;
225062306a36Sopenharmony_ci	struct xfs_buf		*bp;
225162306a36Sopenharmony_ci
225262306a36Sopenharmony_ci	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
225362306a36Sopenharmony_ci
225462306a36Sopenharmony_ci	/* Wait for IO to complete. */
225562306a36Sopenharmony_ci	while (!list_empty(&wait_list)) {
225662306a36Sopenharmony_ci		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
225762306a36Sopenharmony_ci
225862306a36Sopenharmony_ci		list_del_init(&bp->b_list);
225962306a36Sopenharmony_ci
226062306a36Sopenharmony_ci		/*
226162306a36Sopenharmony_ci		 * Wait on the locked buffer, check for errors and unlock and
226262306a36Sopenharmony_ci		 * release the delwri queue reference.
226362306a36Sopenharmony_ci		 */
226462306a36Sopenharmony_ci		error2 = xfs_buf_iowait(bp);
226562306a36Sopenharmony_ci		xfs_buf_relse(bp);
226662306a36Sopenharmony_ci		if (!error)
226762306a36Sopenharmony_ci			error = error2;
226862306a36Sopenharmony_ci	}
226962306a36Sopenharmony_ci
227062306a36Sopenharmony_ci	return error;
227162306a36Sopenharmony_ci}
227262306a36Sopenharmony_ci
/*
 * Push a single buffer on a delwri queue.
 *
 * The purpose of this function is to submit a single buffer of a delwri queue
 * and return with the buffer still on the original queue. The waiting delwri
 * buffer submission infrastructure guarantees transfer of the delwri queue
 * buffer reference to a temporary wait list. We reuse this infrastructure to
 * transfer the buffer back to the original queue.
 *
 * Note the buffer transitions from the queued state, to the submitted and wait
 * listed state and back to the queued state during this call. The buffer
 * locking and queue management logic between _delwri_pushbuf() and
 * _delwri_queue() guarantee that the buffer cannot be queued to another list
 * before returning.
 *
 * Returns 0 on success, or the I/O error reported by xfs_buf_iowait().
 */
int
xfs_buf_delwri_pushbuf(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	LIST_HEAD		(submit_list);
	int			error;

	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);

	/*
	 * Isolate the buffer to a new local list so we can submit it for I/O
	 * independently from the rest of the original list.
	 */
	xfs_buf_lock(bp);
	list_move(&bp->b_list, &submit_list);
	xfs_buf_unlock(bp);

	/*
	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
	 * the buffer on the wait list with the original reference. Rather than
	 * bounce the buffer from a local wait list back to the original list
	 * after I/O completion, reuse the original list as the wait list.
	 */
	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);

	/*
	 * The buffer is now locked, under I/O and wait listed on the original
	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
	 * return with the buffer unlocked and on the original queue.
	 */
	error = xfs_buf_iowait(bp);
	bp->b_flags |= _XBF_DELWRI_Q;
	xfs_buf_unlock(bp);

	return error;
}
232762306a36Sopenharmony_ci
232862306a36Sopenharmony_civoid xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
232962306a36Sopenharmony_ci{
233062306a36Sopenharmony_ci	/*
233162306a36Sopenharmony_ci	 * Set the lru reference count to 0 based on the error injection tag.
233262306a36Sopenharmony_ci	 * This allows userspace to disrupt buffer caching for debug/testing
233362306a36Sopenharmony_ci	 * purposes.
233462306a36Sopenharmony_ci	 */
233562306a36Sopenharmony_ci	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
233662306a36Sopenharmony_ci		lru_ref = 0;
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci	atomic_set(&bp->b_lru_ref, lru_ref);
233962306a36Sopenharmony_ci}
234062306a36Sopenharmony_ci
234162306a36Sopenharmony_ci/*
234262306a36Sopenharmony_ci * Verify an on-disk magic value against the magic value specified in the
234362306a36Sopenharmony_ci * verifier structure. The verifier magic is in disk byte order so the caller is
234462306a36Sopenharmony_ci * expected to pass the value directly from disk.
234562306a36Sopenharmony_ci */
234662306a36Sopenharmony_cibool
234762306a36Sopenharmony_cixfs_verify_magic(
234862306a36Sopenharmony_ci	struct xfs_buf		*bp,
234962306a36Sopenharmony_ci	__be32			dmagic)
235062306a36Sopenharmony_ci{
235162306a36Sopenharmony_ci	struct xfs_mount	*mp = bp->b_mount;
235262306a36Sopenharmony_ci	int			idx;
235362306a36Sopenharmony_ci
235462306a36Sopenharmony_ci	idx = xfs_has_crc(mp);
235562306a36Sopenharmony_ci	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
235662306a36Sopenharmony_ci		return false;
235762306a36Sopenharmony_ci	return dmagic == bp->b_ops->magic[idx];
235862306a36Sopenharmony_ci}
235962306a36Sopenharmony_ci/*
236062306a36Sopenharmony_ci * Verify an on-disk magic value against the magic value specified in the
236162306a36Sopenharmony_ci * verifier structure. The verifier magic is in disk byte order so the caller is
236262306a36Sopenharmony_ci * expected to pass the value directly from disk.
236362306a36Sopenharmony_ci */
236462306a36Sopenharmony_cibool
236562306a36Sopenharmony_cixfs_verify_magic16(
236662306a36Sopenharmony_ci	struct xfs_buf		*bp,
236762306a36Sopenharmony_ci	__be16			dmagic)
236862306a36Sopenharmony_ci{
236962306a36Sopenharmony_ci	struct xfs_mount	*mp = bp->b_mount;
237062306a36Sopenharmony_ci	int			idx;
237162306a36Sopenharmony_ci
237262306a36Sopenharmony_ci	idx = xfs_has_crc(mp);
237362306a36Sopenharmony_ci	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
237462306a36Sopenharmony_ci		return false;
237562306a36Sopenharmony_ci	return dmagic == bp->b_ops->magic16[idx];
237662306a36Sopenharmony_ci}
2377