xref: /kernel/linux/linux-5.10/fs/xfs/xfs_buf.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6#include "xfs.h"
7#include <linux/backing-dev.h>
8
9#include "xfs_shared.h"
10#include "xfs_format.h"
11#include "xfs_log_format.h"
12#include "xfs_trans_resv.h"
13#include "xfs_sb.h"
14#include "xfs_mount.h"
15#include "xfs_trace.h"
16#include "xfs_log.h"
17#include "xfs_log_recover.h"
18#include "xfs_trans.h"
19#include "xfs_buf_item.h"
20#include "xfs_errortag.h"
21#include "xfs_error.h"
22
23static kmem_zone_t *xfs_buf_zone;
24
25#define xb_to_gfp(flags) \
26	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
27
28/*
29 * Locking orders
30 *
31 * xfs_buf_ioacct_inc:
32 * xfs_buf_ioacct_dec:
33 *	b_sema (caller holds)
34 *	  b_lock
35 *
36 * xfs_buf_stale:
37 *	b_sema (caller holds)
38 *	  b_lock
39 *	    lru_lock
40 *
41 * xfs_buf_rele:
42 *	b_lock
43 *	  pag_buf_lock
44 *	    lru_lock
45 *
46 * xfs_buftarg_wait_rele
47 *	lru_lock
48 *	  b_lock (trylock due to inversion)
49 *
50 * xfs_buftarg_isolate
51 *	lru_lock
52 *	  b_lock (trylock due to inversion)
53 */
54
55static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
56
57static inline int
58xfs_buf_submit(
59	struct xfs_buf		*bp)
60{
61	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
62}
63
64static inline int
65xfs_buf_is_vmapped(
66	struct xfs_buf	*bp)
67{
68	/*
69	 * Return true if the buffer is vmapped.
70	 *
71	 * b_addr is null if the buffer is not mapped, but the code is clever
72	 * enough to know it doesn't have to map a single page, so the check has
73	 * to be both for b_addr and bp->b_page_count > 1.
74	 */
75	return bp->b_addr && bp->b_page_count > 1;
76}
77
78static inline int
79xfs_buf_vmap_len(
80	struct xfs_buf	*bp)
81{
82	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
83}
84
85/*
86 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
87 * this buffer. The count is incremented once per buffer (per hold cycle)
88 * because the corresponding decrement is deferred to buffer release. Buffers
89 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
90 * tracking adds unnecessary overhead. This is used for sychronization purposes
91 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
92 * in-flight buffers.
93 *
94 * Buffers that are never released (e.g., superblock, iclog buffers) must set
95 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
96 * never reaches zero and unmount hangs indefinitely.
97 */
98static inline void
99xfs_buf_ioacct_inc(
100	struct xfs_buf	*bp)
101{
102	if (bp->b_flags & XBF_NO_IOACCT)
103		return;
104
105	ASSERT(bp->b_flags & XBF_ASYNC);
106	spin_lock(&bp->b_lock);
107	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
108		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
109		percpu_counter_inc(&bp->b_target->bt_io_count);
110	}
111	spin_unlock(&bp->b_lock);
112}
113
114/*
115 * Clear the in-flight state on a buffer about to be released to the LRU or
116 * freed and unaccount from the buftarg.
117 */
118static inline void
119__xfs_buf_ioacct_dec(
120	struct xfs_buf	*bp)
121{
122	lockdep_assert_held(&bp->b_lock);
123
124	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
125		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
126		percpu_counter_dec(&bp->b_target->bt_io_count);
127	}
128}
129
130static inline void
131xfs_buf_ioacct_dec(
132	struct xfs_buf	*bp)
133{
134	spin_lock(&bp->b_lock);
135	__xfs_buf_ioacct_dec(bp);
136	spin_unlock(&bp->b_lock);
137}
138
139/*
140 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
141 * b_lru_ref count so that the buffer is freed immediately when the buffer
142 * reference count falls to zero. If the buffer is already on the LRU, we need
143 * to remove the reference that LRU holds on the buffer.
144 *
145 * This prevents build-up of stale buffers on the LRU.
146 */
147void
148xfs_buf_stale(
149	struct xfs_buf	*bp)
150{
151	ASSERT(xfs_buf_islocked(bp));
152
153	bp->b_flags |= XBF_STALE;
154
155	/*
156	 * Clear the delwri status so that a delwri queue walker will not
157	 * flush this buffer to disk now that it is stale. The delwri queue has
158	 * a reference to the buffer, so this is safe to do.
159	 */
160	bp->b_flags &= ~_XBF_DELWRI_Q;
161
162	/*
163	 * Once the buffer is marked stale and unlocked, a subsequent lookup
164	 * could reset b_flags. There is no guarantee that the buffer is
165	 * unaccounted (released to LRU) before that occurs. Drop in-flight
166	 * status now to preserve accounting consistency.
167	 */
168	spin_lock(&bp->b_lock);
169	__xfs_buf_ioacct_dec(bp);
170
171	atomic_set(&bp->b_lru_ref, 0);
172	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
173	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
174		atomic_dec(&bp->b_hold);
175
176	ASSERT(atomic_read(&bp->b_hold) >= 1);
177	spin_unlock(&bp->b_lock);
178}
179
180static int
181xfs_buf_get_maps(
182	struct xfs_buf		*bp,
183	int			map_count)
184{
185	ASSERT(bp->b_maps == NULL);
186	bp->b_map_count = map_count;
187
188	if (map_count == 1) {
189		bp->b_maps = &bp->__b_map;
190		return 0;
191	}
192
193	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
194				KM_NOFS);
195	if (!bp->b_maps)
196		return -ENOMEM;
197	return 0;
198}
199
200/*
201 *	Frees b_pages if it was allocated.
202 */
203static void
204xfs_buf_free_maps(
205	struct xfs_buf	*bp)
206{
207	if (bp->b_maps != &bp->__b_map) {
208		kmem_free(bp->b_maps);
209		bp->b_maps = NULL;
210	}
211}
212
213static int
214_xfs_buf_alloc(
215	struct xfs_buftarg	*target,
216	struct xfs_buf_map	*map,
217	int			nmaps,
218	xfs_buf_flags_t		flags,
219	struct xfs_buf		**bpp)
220{
221	struct xfs_buf		*bp;
222	int			error;
223	int			i;
224
225	*bpp = NULL;
226	bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
227
228	/*
229	 * We don't want certain flags to appear in b_flags unless they are
230	 * specifically set by later operations on the buffer.
231	 */
232	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
233
234	atomic_set(&bp->b_hold, 1);
235	atomic_set(&bp->b_lru_ref, 1);
236	init_completion(&bp->b_iowait);
237	INIT_LIST_HEAD(&bp->b_lru);
238	INIT_LIST_HEAD(&bp->b_list);
239	INIT_LIST_HEAD(&bp->b_li_list);
240	sema_init(&bp->b_sema, 0); /* held, no waiters */
241	spin_lock_init(&bp->b_lock);
242	bp->b_target = target;
243	bp->b_mount = target->bt_mount;
244	bp->b_flags = flags;
245
246	/*
247	 * Set length and io_length to the same value initially.
248	 * I/O routines should use io_length, which will be the same in
249	 * most cases but may be reset (e.g. XFS recovery).
250	 */
251	error = xfs_buf_get_maps(bp, nmaps);
252	if (error)  {
253		kmem_cache_free(xfs_buf_zone, bp);
254		return error;
255	}
256
257	bp->b_bn = map[0].bm_bn;
258	bp->b_length = 0;
259	for (i = 0; i < nmaps; i++) {
260		bp->b_maps[i].bm_bn = map[i].bm_bn;
261		bp->b_maps[i].bm_len = map[i].bm_len;
262		bp->b_length += map[i].bm_len;
263	}
264
265	atomic_set(&bp->b_pin_count, 0);
266	init_waitqueue_head(&bp->b_waiters);
267
268	XFS_STATS_INC(bp->b_mount, xb_create);
269	trace_xfs_buf_init(bp, _RET_IP_);
270
271	*bpp = bp;
272	return 0;
273}
274
275/*
276 *	Allocate a page array capable of holding a specified number
277 *	of pages, and point the page buf at it.
278 */
279STATIC int
280_xfs_buf_get_pages(
281	xfs_buf_t		*bp,
282	int			page_count)
283{
284	/* Make sure that we have a page list */
285	if (bp->b_pages == NULL) {
286		bp->b_page_count = page_count;
287		if (page_count <= XB_PAGES) {
288			bp->b_pages = bp->b_page_array;
289		} else {
290			bp->b_pages = kmem_alloc(sizeof(struct page *) *
291						 page_count, KM_NOFS);
292			if (bp->b_pages == NULL)
293				return -ENOMEM;
294		}
295		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
296	}
297	return 0;
298}
299
300/*
301 *	Frees b_pages if it was allocated.
302 */
303STATIC void
304_xfs_buf_free_pages(
305	xfs_buf_t	*bp)
306{
307	if (bp->b_pages != bp->b_page_array) {
308		kmem_free(bp->b_pages);
309		bp->b_pages = NULL;
310	}
311}
312
313/*
314 *	Releases the specified buffer.
315 *
316 * 	The modification state of any associated pages is left unchanged.
317 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
318 * 	hashed and refcounted buffers
319 */
320static void
321xfs_buf_free(
322	xfs_buf_t		*bp)
323{
324	trace_xfs_buf_free(bp, _RET_IP_);
325
326	ASSERT(list_empty(&bp->b_lru));
327
328	if (bp->b_flags & _XBF_PAGES) {
329		uint		i;
330
331		if (xfs_buf_is_vmapped(bp))
332			vm_unmap_ram(bp->b_addr - bp->b_offset,
333					bp->b_page_count);
334
335		for (i = 0; i < bp->b_page_count; i++) {
336			struct page	*page = bp->b_pages[i];
337
338			__free_page(page);
339		}
340		if (current->reclaim_state)
341			current->reclaim_state->reclaimed_slab +=
342							bp->b_page_count;
343	} else if (bp->b_flags & _XBF_KMEM)
344		kmem_free(bp->b_addr);
345	_xfs_buf_free_pages(bp);
346	xfs_buf_free_maps(bp);
347	kmem_cache_free(xfs_buf_zone, bp);
348}
349
350/*
351 * Allocates all the pages for buffer in question and builds it's page list.
352 */
353STATIC int
354xfs_buf_allocate_memory(
355	xfs_buf_t		*bp,
356	uint			flags)
357{
358	size_t			size;
359	size_t			nbytes, offset;
360	gfp_t			gfp_mask = xb_to_gfp(flags);
361	unsigned short		page_count, i;
362	xfs_off_t		start, end;
363	int			error;
364	xfs_km_flags_t		kmflag_mask = 0;
365
366	/*
367	 * assure zeroed buffer for non-read cases.
368	 */
369	if (!(flags & XBF_READ)) {
370		kmflag_mask |= KM_ZERO;
371		gfp_mask |= __GFP_ZERO;
372	}
373
374	/*
375	 * for buffers that are contained within a single page, just allocate
376	 * the memory from the heap - there's no need for the complexity of
377	 * page arrays to keep allocation down to order 0.
378	 */
379	size = BBTOB(bp->b_length);
380	if (size < PAGE_SIZE) {
381		int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
382		bp->b_addr = kmem_alloc_io(size, align_mask,
383					   KM_NOFS | kmflag_mask);
384		if (!bp->b_addr) {
385			/* low memory - use alloc_page loop instead */
386			goto use_alloc_page;
387		}
388
389		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
390		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
391			/* b_addr spans two pages - use alloc_page instead */
392			kmem_free(bp->b_addr);
393			bp->b_addr = NULL;
394			goto use_alloc_page;
395		}
396		bp->b_offset = offset_in_page(bp->b_addr);
397		bp->b_pages = bp->b_page_array;
398		bp->b_pages[0] = kmem_to_page(bp->b_addr);
399		bp->b_page_count = 1;
400		bp->b_flags |= _XBF_KMEM;
401		return 0;
402	}
403
404use_alloc_page:
405	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
406	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
407								>> PAGE_SHIFT;
408	page_count = end - start;
409	error = _xfs_buf_get_pages(bp, page_count);
410	if (unlikely(error))
411		return error;
412
413	offset = bp->b_offset;
414	bp->b_flags |= _XBF_PAGES;
415
416	for (i = 0; i < bp->b_page_count; i++) {
417		struct page	*page;
418		uint		retries = 0;
419retry:
420		page = alloc_page(gfp_mask);
421		if (unlikely(page == NULL)) {
422			if (flags & XBF_READ_AHEAD) {
423				bp->b_page_count = i;
424				error = -ENOMEM;
425				goto out_free_pages;
426			}
427
428			/*
429			 * This could deadlock.
430			 *
431			 * But until all the XFS lowlevel code is revamped to
432			 * handle buffer allocation failures we can't do much.
433			 */
434			if (!(++retries % 100))
435				xfs_err(NULL,
436		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
437					current->comm, current->pid,
438					__func__, gfp_mask);
439
440			XFS_STATS_INC(bp->b_mount, xb_page_retries);
441			congestion_wait(BLK_RW_ASYNC, HZ/50);
442			goto retry;
443		}
444
445		XFS_STATS_INC(bp->b_mount, xb_page_found);
446
447		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
448		size -= nbytes;
449		bp->b_pages[i] = page;
450		offset = 0;
451	}
452	return 0;
453
454out_free_pages:
455	for (i = 0; i < bp->b_page_count; i++)
456		__free_page(bp->b_pages[i]);
457	bp->b_flags &= ~_XBF_PAGES;
458	return error;
459}
460
461/*
462 *	Map buffer into kernel address-space if necessary.
463 */
464STATIC int
465_xfs_buf_map_pages(
466	xfs_buf_t		*bp,
467	uint			flags)
468{
469	ASSERT(bp->b_flags & _XBF_PAGES);
470	if (bp->b_page_count == 1) {
471		/* A single page buffer is always mappable */
472		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
473	} else if (flags & XBF_UNMAPPED) {
474		bp->b_addr = NULL;
475	} else {
476		int retried = 0;
477		unsigned nofs_flag;
478
479		/*
480		 * vm_map_ram() will allocate auxiliary structures (e.g.
481		 * pagetables) with GFP_KERNEL, yet we are likely to be under
482		 * GFP_NOFS context here. Hence we need to tell memory reclaim
483		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
484		 * memory reclaim re-entering the filesystem here and
485		 * potentially deadlocking.
486		 */
487		nofs_flag = memalloc_nofs_save();
488		do {
489			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
490						-1);
491			if (bp->b_addr)
492				break;
493			vm_unmap_aliases();
494		} while (retried++ <= 1);
495		memalloc_nofs_restore(nofs_flag);
496
497		if (!bp->b_addr)
498			return -ENOMEM;
499		bp->b_addr += bp->b_offset;
500	}
501
502	return 0;
503}
504
505/*
506 *	Finding and Reading Buffers
507 */
508static int
509_xfs_buf_obj_cmp(
510	struct rhashtable_compare_arg	*arg,
511	const void			*obj)
512{
513	const struct xfs_buf_map	*map = arg->key;
514	const struct xfs_buf		*bp = obj;
515
516	/*
517	 * The key hashing in the lookup path depends on the key being the
518	 * first element of the compare_arg, make sure to assert this.
519	 */
520	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
521
522	if (bp->b_bn != map->bm_bn)
523		return 1;
524
525	if (unlikely(bp->b_length != map->bm_len)) {
526		/*
527		 * found a block number match. If the range doesn't
528		 * match, the only way this is allowed is if the buffer
529		 * in the cache is stale and the transaction that made
530		 * it stale has not yet committed. i.e. we are
531		 * reallocating a busy extent. Skip this buffer and
532		 * continue searching for an exact match.
533		 */
534		ASSERT(bp->b_flags & XBF_STALE);
535		return 1;
536	}
537	return 0;
538}
539
540static const struct rhashtable_params xfs_buf_hash_params = {
541	.min_size		= 32,	/* empty AGs have minimal footprint */
542	.nelem_hint		= 16,
543	.key_len		= sizeof(xfs_daddr_t),
544	.key_offset		= offsetof(struct xfs_buf, b_bn),
545	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
546	.automatic_shrinking	= true,
547	.obj_cmpfn		= _xfs_buf_obj_cmp,
548};
549
550int
551xfs_buf_hash_init(
552	struct xfs_perag	*pag)
553{
554	spin_lock_init(&pag->pag_buf_lock);
555	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
556}
557
558void
559xfs_buf_hash_destroy(
560	struct xfs_perag	*pag)
561{
562	rhashtable_destroy(&pag->pag_buf_hash);
563}
564
565/*
566 * Look up a buffer in the buffer cache and return it referenced and locked
567 * in @found_bp.
568 *
569 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
570 * cache.
571 *
572 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
573 * -EAGAIN if we fail to lock it.
574 *
575 * Return values are:
576 *	-EFSCORRUPTED if have been supplied with an invalid address
577 *	-EAGAIN on trylock failure
578 *	-ENOENT if we fail to find a match and @new_bp was NULL
579 *	0, with @found_bp:
580 *		- @new_bp if we inserted it into the cache
581 *		- the buffer we found and locked.
582 */
583static int
584xfs_buf_find(
585	struct xfs_buftarg	*btp,
586	struct xfs_buf_map	*map,
587	int			nmaps,
588	xfs_buf_flags_t		flags,
589	struct xfs_buf		*new_bp,
590	struct xfs_buf		**found_bp)
591{
592	struct xfs_perag	*pag;
593	xfs_buf_t		*bp;
594	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
595	xfs_daddr_t		eofs;
596	int			i;
597
598	*found_bp = NULL;
599
600	for (i = 0; i < nmaps; i++)
601		cmap.bm_len += map[i].bm_len;
602
603	/* Check for IOs smaller than the sector size / not sector aligned */
604	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
605	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
606
607	/*
608	 * Corrupted block numbers can get through to here, unfortunately, so we
609	 * have to check that the buffer falls within the filesystem bounds.
610	 */
611	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
612	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
613		xfs_alert(btp->bt_mount,
614			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
615			  __func__, cmap.bm_bn, eofs);
616		WARN_ON(1);
617		return -EFSCORRUPTED;
618	}
619
620	pag = xfs_perag_get(btp->bt_mount,
621			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
622
623	spin_lock(&pag->pag_buf_lock);
624	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
625				    xfs_buf_hash_params);
626	if (bp) {
627		atomic_inc(&bp->b_hold);
628		goto found;
629	}
630
631	/* No match found */
632	if (!new_bp) {
633		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
634		spin_unlock(&pag->pag_buf_lock);
635		xfs_perag_put(pag);
636		return -ENOENT;
637	}
638
639	/* the buffer keeps the perag reference until it is freed */
640	new_bp->b_pag = pag;
641	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
642			       xfs_buf_hash_params);
643	spin_unlock(&pag->pag_buf_lock);
644	*found_bp = new_bp;
645	return 0;
646
647found:
648	spin_unlock(&pag->pag_buf_lock);
649	xfs_perag_put(pag);
650
651	if (!xfs_buf_trylock(bp)) {
652		if (flags & XBF_TRYLOCK) {
653			xfs_buf_rele(bp);
654			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
655			return -EAGAIN;
656		}
657		xfs_buf_lock(bp);
658		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
659	}
660
661	/*
662	 * if the buffer is stale, clear all the external state associated with
663	 * it. We need to keep flags such as how we allocated the buffer memory
664	 * intact here.
665	 */
666	if (bp->b_flags & XBF_STALE) {
667		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
668		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
669		bp->b_ops = NULL;
670	}
671
672	trace_xfs_buf_find(bp, flags, _RET_IP_);
673	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
674	*found_bp = bp;
675	return 0;
676}
677
678struct xfs_buf *
679xfs_buf_incore(
680	struct xfs_buftarg	*target,
681	xfs_daddr_t		blkno,
682	size_t			numblks,
683	xfs_buf_flags_t		flags)
684{
685	struct xfs_buf		*bp;
686	int			error;
687	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
688
689	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
690	if (error)
691		return NULL;
692	return bp;
693}
694
695/*
696 * Assembles a buffer covering the specified range. The code is optimised for
697 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
698 * more hits than misses.
699 */
700int
701xfs_buf_get_map(
702	struct xfs_buftarg	*target,
703	struct xfs_buf_map	*map,
704	int			nmaps,
705	xfs_buf_flags_t		flags,
706	struct xfs_buf		**bpp)
707{
708	struct xfs_buf		*bp;
709	struct xfs_buf		*new_bp;
710	int			error = 0;
711
712	*bpp = NULL;
713	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
714	if (!error)
715		goto found;
716	if (error != -ENOENT)
717		return error;
718
719	error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
720	if (error)
721		return error;
722
723	error = xfs_buf_allocate_memory(new_bp, flags);
724	if (error) {
725		xfs_buf_free(new_bp);
726		return error;
727	}
728
729	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
730	if (error) {
731		xfs_buf_free(new_bp);
732		return error;
733	}
734
735	if (bp != new_bp)
736		xfs_buf_free(new_bp);
737
738found:
739	if (!bp->b_addr) {
740		error = _xfs_buf_map_pages(bp, flags);
741		if (unlikely(error)) {
742			xfs_warn_ratelimited(target->bt_mount,
743				"%s: failed to map %u pages", __func__,
744				bp->b_page_count);
745			xfs_buf_relse(bp);
746			return error;
747		}
748	}
749
750	/*
751	 * Clear b_error if this is a lookup from a caller that doesn't expect
752	 * valid data to be found in the buffer.
753	 */
754	if (!(flags & XBF_READ))
755		xfs_buf_ioerror(bp, 0);
756
757	XFS_STATS_INC(target->bt_mount, xb_get);
758	trace_xfs_buf_get(bp, flags, _RET_IP_);
759	*bpp = bp;
760	return 0;
761}
762
763int
764_xfs_buf_read(
765	xfs_buf_t		*bp,
766	xfs_buf_flags_t		flags)
767{
768	ASSERT(!(flags & XBF_WRITE));
769	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
770
771	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
772	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
773
774	return xfs_buf_submit(bp);
775}
776
777/*
778 * Reverify a buffer found in cache without an attached ->b_ops.
779 *
780 * If the caller passed an ops structure and the buffer doesn't have ops
781 * assigned, set the ops and use it to verify the contents. If verification
782 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
783 * already in XBF_DONE state on entry.
784 *
785 * Under normal operations, every in-core buffer is verified on read I/O
786 * completion. There are two scenarios that can lead to in-core buffers without
787 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
788 * filesystem, though these buffers are purged at the end of recovery. The
789 * other is online repair, which intentionally reads with a NULL buffer ops to
790 * run several verifiers across an in-core buffer in order to establish buffer
791 * type.  If repair can't establish that, the buffer will be left in memory
792 * with NULL buffer ops.
793 */
794int
795xfs_buf_reverify(
796	struct xfs_buf		*bp,
797	const struct xfs_buf_ops *ops)
798{
799	ASSERT(bp->b_flags & XBF_DONE);
800	ASSERT(bp->b_error == 0);
801
802	if (!ops || bp->b_ops)
803		return 0;
804
805	bp->b_ops = ops;
806	bp->b_ops->verify_read(bp);
807	if (bp->b_error)
808		bp->b_flags &= ~XBF_DONE;
809	return bp->b_error;
810}
811
812int
813xfs_buf_read_map(
814	struct xfs_buftarg	*target,
815	struct xfs_buf_map	*map,
816	int			nmaps,
817	xfs_buf_flags_t		flags,
818	struct xfs_buf		**bpp,
819	const struct xfs_buf_ops *ops,
820	xfs_failaddr_t		fa)
821{
822	struct xfs_buf		*bp;
823	int			error;
824
825	flags |= XBF_READ;
826	*bpp = NULL;
827
828	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
829	if (error)
830		return error;
831
832	trace_xfs_buf_read(bp, flags, _RET_IP_);
833
834	if (!(bp->b_flags & XBF_DONE)) {
835		/* Initiate the buffer read and wait. */
836		XFS_STATS_INC(target->bt_mount, xb_get_read);
837		bp->b_ops = ops;
838		error = _xfs_buf_read(bp, flags);
839
840		/* Readahead iodone already dropped the buffer, so exit. */
841		if (flags & XBF_ASYNC)
842			return 0;
843	} else {
844		/* Buffer already read; all we need to do is check it. */
845		error = xfs_buf_reverify(bp, ops);
846
847		/* Readahead already finished; drop the buffer and exit. */
848		if (flags & XBF_ASYNC) {
849			xfs_buf_relse(bp);
850			return 0;
851		}
852
853		/* We do not want read in the flags */
854		bp->b_flags &= ~XBF_READ;
855		ASSERT(bp->b_ops != NULL || ops == NULL);
856	}
857
858	/*
859	 * If we've had a read error, then the contents of the buffer are
860	 * invalid and should not be used. To ensure that a followup read tries
861	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
862	 * mark the buffer stale. This ensures that anyone who has a current
863	 * reference to the buffer will interpret it's contents correctly and
864	 * future cache lookups will also treat it as an empty, uninitialised
865	 * buffer.
866	 */
867	if (error) {
868		if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
869			xfs_buf_ioerror_alert(bp, fa);
870
871		bp->b_flags &= ~XBF_DONE;
872		xfs_buf_stale(bp);
873		xfs_buf_relse(bp);
874
875		/* bad CRC means corrupted metadata */
876		if (error == -EFSBADCRC)
877			error = -EFSCORRUPTED;
878		return error;
879	}
880
881	*bpp = bp;
882	return 0;
883}
884
885/*
886 *	If we are not low on memory then do the readahead in a deadlock
887 *	safe manner.
888 */
889void
890xfs_buf_readahead_map(
891	struct xfs_buftarg	*target,
892	struct xfs_buf_map	*map,
893	int			nmaps,
894	const struct xfs_buf_ops *ops)
895{
896	struct xfs_buf		*bp;
897
898	if (bdi_read_congested(target->bt_bdev->bd_bdi))
899		return;
900
901	xfs_buf_read_map(target, map, nmaps,
902		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
903		     __this_address);
904}
905
906/*
907 * Read an uncached buffer from disk. Allocates and returns a locked
908 * buffer containing the disk contents or nothing.
909 */
910int
911xfs_buf_read_uncached(
912	struct xfs_buftarg	*target,
913	xfs_daddr_t		daddr,
914	size_t			numblks,
915	int			flags,
916	struct xfs_buf		**bpp,
917	const struct xfs_buf_ops *ops)
918{
919	struct xfs_buf		*bp;
920	int			error;
921
922	*bpp = NULL;
923
924	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
925	if (error)
926		return error;
927
928	/* set up the buffer for a read IO */
929	ASSERT(bp->b_map_count == 1);
930	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
931	bp->b_maps[0].bm_bn = daddr;
932	bp->b_flags |= XBF_READ;
933	bp->b_ops = ops;
934
935	xfs_buf_submit(bp);
936	if (bp->b_error) {
937		error = bp->b_error;
938		xfs_buf_relse(bp);
939		return error;
940	}
941
942	*bpp = bp;
943	return 0;
944}
945
946int
947xfs_buf_get_uncached(
948	struct xfs_buftarg	*target,
949	size_t			numblks,
950	int			flags,
951	struct xfs_buf		**bpp)
952{
953	unsigned long		page_count;
954	int			error, i;
955	struct xfs_buf		*bp;
956	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
957
958	*bpp = NULL;
959
960	/* flags might contain irrelevant bits, pass only what we care about */
961	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
962	if (error)
963		goto fail;
964
965	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
966	error = _xfs_buf_get_pages(bp, page_count);
967	if (error)
968		goto fail_free_buf;
969
970	for (i = 0; i < page_count; i++) {
971		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
972		if (!bp->b_pages[i]) {
973			error = -ENOMEM;
974			goto fail_free_mem;
975		}
976	}
977	bp->b_flags |= _XBF_PAGES;
978
979	error = _xfs_buf_map_pages(bp, 0);
980	if (unlikely(error)) {
981		xfs_warn(target->bt_mount,
982			"%s: failed to map pages", __func__);
983		goto fail_free_mem;
984	}
985
986	trace_xfs_buf_get_uncached(bp, _RET_IP_);
987	*bpp = bp;
988	return 0;
989
990 fail_free_mem:
991	while (--i >= 0)
992		__free_page(bp->b_pages[i]);
993	_xfs_buf_free_pages(bp);
994 fail_free_buf:
995	xfs_buf_free_maps(bp);
996	kmem_cache_free(xfs_buf_zone, bp);
997 fail:
998	return error;
999}
1000
1001/*
1002 *	Increment reference count on buffer, to hold the buffer concurrently
1003 *	with another thread which may release (free) the buffer asynchronously.
1004 *	Must hold the buffer already to call this function.
1005 */
1006void
1007xfs_buf_hold(
1008	xfs_buf_t		*bp)
1009{
1010	trace_xfs_buf_hold(bp, _RET_IP_);
1011	atomic_inc(&bp->b_hold);
1012}
1013
1014/*
1015 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
1016 * placed on LRU or freed (depending on b_lru_ref).
1017 */
1018void
1019xfs_buf_rele(
1020	xfs_buf_t		*bp)
1021{
1022	struct xfs_perag	*pag = bp->b_pag;
1023	bool			release;
1024	bool			freebuf = false;
1025
1026	trace_xfs_buf_rele(bp, _RET_IP_);
1027
1028	if (!pag) {
1029		ASSERT(list_empty(&bp->b_lru));
1030		if (atomic_dec_and_test(&bp->b_hold)) {
1031			xfs_buf_ioacct_dec(bp);
1032			xfs_buf_free(bp);
1033		}
1034		return;
1035	}
1036
1037	ASSERT(atomic_read(&bp->b_hold) > 0);
1038
1039	/*
1040	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
1041	 * calls. The pag_buf_lock being taken on the last reference only
1042	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
1043	 * to last reference we drop here is not serialised against the last
1044	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
1045	 * first, the last "release" reference can win the race to the lock and
1046	 * free the buffer before the second-to-last reference is processed,
1047	 * leading to a use-after-free scenario.
1048	 */
1049	spin_lock(&bp->b_lock);
1050	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
1051	if (!release) {
1052		/*
1053		 * Drop the in-flight state if the buffer is already on the LRU
1054		 * and it holds the only reference. This is racy because we
1055		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
1056		 * ensures the decrement occurs only once per-buf.
1057		 */
1058		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
1059			__xfs_buf_ioacct_dec(bp);
1060		goto out_unlock;
1061	}
1062
1063	/* the last reference has been dropped ... */
1064	__xfs_buf_ioacct_dec(bp);
1065	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
1066		/*
1067		 * If the buffer is added to the LRU take a new reference to the
1068		 * buffer for the LRU and clear the (now stale) dispose list
1069		 * state flag
1070		 */
1071		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
1072			bp->b_state &= ~XFS_BSTATE_DISPOSE;
1073			atomic_inc(&bp->b_hold);
1074		}
1075		spin_unlock(&pag->pag_buf_lock);
1076	} else {
1077		/*
1078		 * most of the time buffers will already be removed from the
1079		 * LRU, so optimise that case by checking for the
1080		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
1081		 * was on was the disposal list
1082		 */
1083		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
1084			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
1085		} else {
1086			ASSERT(list_empty(&bp->b_lru));
1087		}
1088
1089		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1090		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
1091				       xfs_buf_hash_params);
1092		spin_unlock(&pag->pag_buf_lock);
1093		xfs_perag_put(pag);
1094		freebuf = true;
1095	}
1096
1097out_unlock:
1098	spin_unlock(&bp->b_lock);
1099
1100	if (freebuf)
1101		xfs_buf_free(bp);
1102}
1103
1104
1105/*
1106 *	Lock a buffer object, if it is not already locked.
1107 *
1108 *	If we come across a stale, pinned, locked buffer, we know that we are
1109 *	being asked to lock a buffer that has been reallocated. Because it is
1110 *	pinned, we know that the log has not been pushed to disk and hence it
1111 *	will still be locked.  Rather than continuing to have trylock attempts
1112 *	fail until someone else pushes the log, push it ourselves before
1113 *	returning.  This means that the xfsaild will not get stuck trying
1114 *	to push on stale inode buffers.
1115 */
1116int
1117xfs_buf_trylock(
1118	struct xfs_buf		*bp)
1119{
1120	int			locked;
1121
1122	locked = down_trylock(&bp->b_sema) == 0;
1123	if (locked)
1124		trace_xfs_buf_trylock(bp, _RET_IP_);
1125	else
1126		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1127	return locked;
1128}
1129
1130/*
1131 *	Lock a buffer object.
1132 *
1133 *	If we come across a stale, pinned, locked buffer, we know that we
1134 *	are being asked to lock a buffer that has been reallocated. Because
1135 *	it is pinned, we know that the log has not been pushed to disk and
1136 *	hence it will still be locked. Rather than sleeping until someone
1137 *	else pushes the log, push it ourselves before trying to get the lock.
1138 */
1139void
1140xfs_buf_lock(
1141	struct xfs_buf		*bp)
1142{
1143	trace_xfs_buf_lock(bp, _RET_IP_);
1144
1145	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
1146		xfs_log_force(bp->b_mount, 0);
1147	down(&bp->b_sema);
1148
1149	trace_xfs_buf_lock_done(bp, _RET_IP_);
1150}
1151
1152void
1153xfs_buf_unlock(
1154	struct xfs_buf		*bp)
1155{
1156	ASSERT(xfs_buf_islocked(bp));
1157
1158	up(&bp->b_sema);
1159	trace_xfs_buf_unlock(bp, _RET_IP_);
1160}
1161
1162STATIC void
1163xfs_buf_wait_unpin(
1164	xfs_buf_t		*bp)
1165{
1166	DECLARE_WAITQUEUE	(wait, current);
1167
1168	if (atomic_read(&bp->b_pin_count) == 0)
1169		return;
1170
1171	add_wait_queue(&bp->b_waiters, &wait);
1172	for (;;) {
1173		set_current_state(TASK_UNINTERRUPTIBLE);
1174		if (atomic_read(&bp->b_pin_count) == 0)
1175			break;
1176		io_schedule();
1177	}
1178	remove_wait_queue(&bp->b_waiters, &wait);
1179	set_current_state(TASK_RUNNING);
1180}
1181
1182static void
1183xfs_buf_ioerror_alert_ratelimited(
1184	struct xfs_buf		*bp)
1185{
1186	static unsigned long	lasttime;
1187	static struct xfs_buftarg *lasttarg;
1188
1189	if (bp->b_target != lasttarg ||
1190	    time_after(jiffies, (lasttime + 5*HZ))) {
1191		lasttime = jiffies;
1192		xfs_buf_ioerror_alert(bp, __this_address);
1193	}
1194	lasttarg = bp->b_target;
1195}
1196
1197/*
1198 * Account for this latest trip around the retry handler, and decide if
1199 * we've failed enough times to constitute a permanent failure.
1200 */
1201static bool
1202xfs_buf_ioerror_permanent(
1203	struct xfs_buf		*bp,
1204	struct xfs_error_cfg	*cfg)
1205{
1206	struct xfs_mount	*mp = bp->b_mount;
1207
1208	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1209	    ++bp->b_retries > cfg->max_retries)
1210		return true;
1211	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1212	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
1213		return true;
1214
1215	/* At unmount we may treat errors differently */
1216	if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
1217		return true;
1218
1219	return false;
1220}
1221
1222/*
1223 * On a sync write or shutdown we just want to stale the buffer and let the
1224 * caller handle the error in bp->b_error appropriately.
1225 *
1226 * If the write was asynchronous then no one will be looking for the error.  If
1227 * this is the first failure of this type, clear the error state and write the
1228 * buffer out again. This means we always retry an async write failure at least
1229 * once, but we also need to set the buffer up to behave correctly now for
1230 * repeated failures.
1231 *
1232 * If we get repeated async write failures, then we take action according to the
1233 * error configuration we have been set up to use.
1234 *
1235 * Returns true if this function took care of error handling and the caller must
1236 * not touch the buffer again.  Return false if the caller should proceed with
1237 * normal I/O completion handling.
1238 */
1239static bool
1240xfs_buf_ioend_handle_error(
1241	struct xfs_buf		*bp)
1242{
1243	struct xfs_mount	*mp = bp->b_mount;
1244	struct xfs_error_cfg	*cfg;
1245
1246	/*
1247	 * If we've already decided to shutdown the filesystem because of I/O
1248	 * errors, there's no point in giving this a retry.
1249	 */
1250	if (XFS_FORCED_SHUTDOWN(mp))
1251		goto out_stale;
1252
1253	xfs_buf_ioerror_alert_ratelimited(bp);
1254
1255	/*
1256	 * We're not going to bother about retrying this during recovery.
1257	 * One strike!
1258	 */
1259	if (bp->b_flags & _XBF_LOGRECOVERY) {
1260		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1261		return false;
1262	}
1263
1264	/*
1265	 * Synchronous writes will have callers process the error.
1266	 */
1267	if (!(bp->b_flags & XBF_ASYNC))
1268		goto out_stale;
1269
1270	trace_xfs_buf_iodone_async(bp, _RET_IP_);
1271
1272	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1273	if (bp->b_last_error != bp->b_error ||
1274	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
1275		bp->b_last_error = bp->b_error;
1276		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1277		    !bp->b_first_retry_time)
1278			bp->b_first_retry_time = jiffies;
1279		goto resubmit;
1280	}
1281
1282	/*
1283	 * Permanent error - we need to trigger a shutdown if we haven't already
1284	 * to indicate that inconsistency will result from this action.
1285	 */
1286	if (xfs_buf_ioerror_permanent(bp, cfg)) {
1287		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1288		goto out_stale;
1289	}
1290
1291	/* Still considered a transient error. Caller will schedule retries. */
1292	if (bp->b_flags & _XBF_INODES)
1293		xfs_buf_inode_io_fail(bp);
1294	else if (bp->b_flags & _XBF_DQUOTS)
1295		xfs_buf_dquot_io_fail(bp);
1296	else
1297		ASSERT(list_empty(&bp->b_li_list));
1298	xfs_buf_ioerror(bp, 0);
1299	xfs_buf_relse(bp);
1300	return true;
1301
1302resubmit:
1303	xfs_buf_ioerror(bp, 0);
1304	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
1305	xfs_buf_submit(bp);
1306	return true;
1307out_stale:
1308	xfs_buf_stale(bp);
1309	bp->b_flags |= XBF_DONE;
1310	bp->b_flags &= ~XBF_WRITE;
1311	trace_xfs_buf_error_relse(bp, _RET_IP_);
1312	return false;
1313}
1314
1315static void
1316xfs_buf_ioend(
1317	struct xfs_buf	*bp)
1318{
1319	trace_xfs_buf_iodone(bp, _RET_IP_);
1320
1321	/*
1322	 * Pull in IO completion errors now. We are guaranteed to be running
1323	 * single threaded, so we don't need the lock to read b_io_error.
1324	 */
1325	if (!bp->b_error && bp->b_io_error)
1326		xfs_buf_ioerror(bp, bp->b_io_error);
1327
1328	if (bp->b_flags & XBF_READ) {
1329		if (!bp->b_error && bp->b_ops)
1330			bp->b_ops->verify_read(bp);
1331		if (!bp->b_error)
1332			bp->b_flags |= XBF_DONE;
1333	} else {
1334		if (!bp->b_error) {
1335			bp->b_flags &= ~XBF_WRITE_FAIL;
1336			bp->b_flags |= XBF_DONE;
1337		}
1338
1339		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
1340			return;
1341
1342		/* clear the retry state */
1343		bp->b_last_error = 0;
1344		bp->b_retries = 0;
1345		bp->b_first_retry_time = 0;
1346
1347		/*
1348		 * Note that for things like remote attribute buffers, there may
1349		 * not be a buffer log item here, so processing the buffer log
1350		 * item must remain optional.
1351		 */
1352		if (bp->b_log_item)
1353			xfs_buf_item_done(bp);
1354
1355		if (bp->b_flags & _XBF_INODES)
1356			xfs_buf_inode_iodone(bp);
1357		else if (bp->b_flags & _XBF_DQUOTS)
1358			xfs_buf_dquot_iodone(bp);
1359
1360	}
1361
1362	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
1363			 _XBF_LOGRECOVERY);
1364
1365	if (bp->b_flags & XBF_ASYNC)
1366		xfs_buf_relse(bp);
1367	else
1368		complete(&bp->b_iowait);
1369}
1370
1371static void
1372xfs_buf_ioend_work(
1373	struct work_struct	*work)
1374{
1375	struct xfs_buf		*bp =
1376		container_of(work, xfs_buf_t, b_ioend_work);
1377
1378	xfs_buf_ioend(bp);
1379}
1380
1381static void
1382xfs_buf_ioend_async(
1383	struct xfs_buf	*bp)
1384{
1385	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1386	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
1387}
1388
1389void
1390__xfs_buf_ioerror(
1391	xfs_buf_t		*bp,
1392	int			error,
1393	xfs_failaddr_t		failaddr)
1394{
1395	ASSERT(error <= 0 && error >= -1000);
1396	bp->b_error = error;
1397	trace_xfs_buf_ioerror(bp, error, failaddr);
1398}
1399
1400void
1401xfs_buf_ioerror_alert(
1402	struct xfs_buf		*bp,
1403	xfs_failaddr_t		func)
1404{
1405	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
1406		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
1407				  func, (uint64_t)XFS_BUF_ADDR(bp),
1408				  bp->b_length, -bp->b_error);
1409}
1410
1411/*
1412 * To simulate an I/O failure, the buffer must be locked and held with at least
1413 * three references. The LRU reference is dropped by the stale call. The buf
1414 * item reference is dropped via ioend processing. The third reference is owned
1415 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
1416 */
1417void
1418xfs_buf_ioend_fail(
1419	struct xfs_buf	*bp)
1420{
1421	bp->b_flags &= ~XBF_DONE;
1422	xfs_buf_stale(bp);
1423	xfs_buf_ioerror(bp, -EIO);
1424	xfs_buf_ioend(bp);
1425}
1426
1427int
1428xfs_bwrite(
1429	struct xfs_buf		*bp)
1430{
1431	int			error;
1432
1433	ASSERT(xfs_buf_islocked(bp));
1434
1435	bp->b_flags |= XBF_WRITE;
1436	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1437			 XBF_DONE);
1438
1439	error = xfs_buf_submit(bp);
1440	if (error)
1441		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
1442	return error;
1443}
1444
1445static void
1446xfs_buf_bio_end_io(
1447	struct bio		*bio)
1448{
1449	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;
1450
1451	if (!bio->bi_status &&
1452	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
1453	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
1454		bio->bi_status = BLK_STS_IOERR;
1455
1456	/*
1457	 * don't overwrite existing errors - otherwise we can lose errors on
1458	 * buffers that require multiple bios to complete.
1459	 */
1460	if (bio->bi_status) {
1461		int error = blk_status_to_errno(bio->bi_status);
1462
1463		cmpxchg(&bp->b_io_error, 0, error);
1464	}
1465
1466	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1467		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1468
1469	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1470		xfs_buf_ioend_async(bp);
1471	bio_put(bio);
1472}
1473
1474static void
1475xfs_buf_ioapply_map(
1476	struct xfs_buf	*bp,
1477	int		map,
1478	int		*buf_offset,
1479	int		*count,
1480	int		op)
1481{
1482	int		page_index;
1483	int		total_nr_pages = bp->b_page_count;
1484	int		nr_pages;
1485	struct bio	*bio;
1486	sector_t	sector =  bp->b_maps[map].bm_bn;
1487	int		size;
1488	int		offset;
1489
1490	/* skip the pages in the buffer before the start offset */
1491	page_index = 0;
1492	offset = *buf_offset;
1493	while (offset >= PAGE_SIZE) {
1494		page_index++;
1495		offset -= PAGE_SIZE;
1496	}
1497
1498	/*
1499	 * Limit the IO size to the length of the current vector, and update the
1500	 * remaining IO count for the next time around.
1501	 */
1502	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
1503	*count -= size;
1504	*buf_offset += size;
1505
1506next_chunk:
1507	atomic_inc(&bp->b_io_remaining);
1508	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
1509
1510	bio = bio_alloc(GFP_NOIO, nr_pages);
1511	bio_set_dev(bio, bp->b_target->bt_bdev);
1512	bio->bi_iter.bi_sector = sector;
1513	bio->bi_end_io = xfs_buf_bio_end_io;
1514	bio->bi_private = bp;
1515	bio->bi_opf = op;
1516
1517	for (; size && nr_pages; nr_pages--, page_index++) {
1518		int	rbytes, nbytes = PAGE_SIZE - offset;
1519
1520		if (nbytes > size)
1521			nbytes = size;
1522
1523		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
1524				      offset);
1525		if (rbytes < nbytes)
1526			break;
1527
1528		offset = 0;
1529		sector += BTOBB(nbytes);
1530		size -= nbytes;
1531		total_nr_pages--;
1532	}
1533
1534	if (likely(bio->bi_iter.bi_size)) {
1535		if (xfs_buf_is_vmapped(bp)) {
1536			flush_kernel_vmap_range(bp->b_addr,
1537						xfs_buf_vmap_len(bp));
1538		}
1539		submit_bio(bio);
1540		if (size)
1541			goto next_chunk;
1542	} else {
1543		/*
1544		 * This is guaranteed not to be the last io reference count
1545		 * because the caller (xfs_buf_submit) holds a count itself.
1546		 */
1547		atomic_dec(&bp->b_io_remaining);
1548		xfs_buf_ioerror(bp, -EIO);
1549		bio_put(bio);
1550	}
1551
1552}
1553
1554STATIC void
1555_xfs_buf_ioapply(
1556	struct xfs_buf	*bp)
1557{
1558	struct blk_plug	plug;
1559	int		op;
1560	int		offset;
1561	int		size;
1562	int		i;
1563
1564	/*
1565	 * Make sure we capture only current IO errors rather than stale errors
1566	 * left over from previous use of the buffer (e.g. failed readahead).
1567	 */
1568	bp->b_error = 0;
1569
1570	if (bp->b_flags & XBF_WRITE) {
1571		op = REQ_OP_WRITE;
1572
1573		/*
1574		 * Run the write verifier callback function if it exists. If
1575		 * this function fails it will mark the buffer with an error and
1576		 * the IO should not be dispatched.
1577		 */
1578		if (bp->b_ops) {
1579			bp->b_ops->verify_write(bp);
1580			if (bp->b_error) {
1581				xfs_force_shutdown(bp->b_mount,
1582						   SHUTDOWN_CORRUPT_INCORE);
1583				return;
1584			}
1585		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
1586			struct xfs_mount *mp = bp->b_mount;
1587
1588			/*
1589			 * non-crc filesystems don't attach verifiers during
1590			 * log recovery, so don't warn for such filesystems.
1591			 */
1592			if (xfs_sb_version_hascrc(&mp->m_sb)) {
1593				xfs_warn(mp,
1594					"%s: no buf ops on daddr 0x%llx len %d",
1595					__func__, bp->b_bn, bp->b_length);
1596				xfs_hex_dump(bp->b_addr,
1597						XFS_CORRUPTION_DUMP_LEN);
1598				dump_stack();
1599			}
1600		}
1601	} else {
1602		op = REQ_OP_READ;
1603		if (bp->b_flags & XBF_READ_AHEAD)
1604			op |= REQ_RAHEAD;
1605	}
1606
1607	/* we only use the buffer cache for meta-data */
1608	op |= REQ_META;
1609
1610	/*
1611	 * Walk all the vectors issuing IO on them. Set up the initial offset
1612	 * into the buffer and the desired IO size before we start -
1613	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
1614	 * subsequent call.
1615	 */
1616	offset = bp->b_offset;
1617	size = BBTOB(bp->b_length);
1618	blk_start_plug(&plug);
1619	for (i = 0; i < bp->b_map_count; i++) {
1620		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
1621		if (bp->b_error)
1622			break;
1623		if (size <= 0)
1624			break;	/* all done */
1625	}
1626	blk_finish_plug(&plug);
1627}
1628
1629/*
1630 * Wait for I/O completion of a sync buffer and return the I/O error code.
1631 */
1632static int
1633xfs_buf_iowait(
1634	struct xfs_buf	*bp)
1635{
1636	ASSERT(!(bp->b_flags & XBF_ASYNC));
1637
1638	trace_xfs_buf_iowait(bp, _RET_IP_);
1639	wait_for_completion(&bp->b_iowait);
1640	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1641
1642	return bp->b_error;
1643}
1644
1645/*
1646 * Buffer I/O submission path, read or write. Asynchronous submission transfers
1647 * the buffer lock ownership and the current reference to the IO. It is not
1648 * safe to reference the buffer after a call to this function unless the caller
1649 * holds an additional reference itself.
1650 */
1651static int
1652__xfs_buf_submit(
1653	struct xfs_buf	*bp,
1654	bool		wait)
1655{
1656	int		error = 0;
1657
1658	trace_xfs_buf_submit(bp, _RET_IP_);
1659
1660	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1661
1662	/* on shutdown we stale and complete the buffer immediately */
1663	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1664		xfs_buf_ioend_fail(bp);
1665		return -EIO;
1666	}
1667
1668	/*
1669	 * Grab a reference so the buffer does not go away underneath us. For
1670	 * async buffers, I/O completion drops the callers reference, which
1671	 * could occur before submission returns.
1672	 */
1673	xfs_buf_hold(bp);
1674
1675	if (bp->b_flags & XBF_WRITE)
1676		xfs_buf_wait_unpin(bp);
1677
1678	/* clear the internal error state to avoid spurious errors */
1679	bp->b_io_error = 0;
1680
1681	/*
1682	 * Set the count to 1 initially, this will stop an I/O completion
1683	 * callout which happens before we have started all the I/O from calling
1684	 * xfs_buf_ioend too early.
1685	 */
1686	atomic_set(&bp->b_io_remaining, 1);
1687	if (bp->b_flags & XBF_ASYNC)
1688		xfs_buf_ioacct_inc(bp);
1689	_xfs_buf_ioapply(bp);
1690
1691	/*
1692	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
1693	 * reference we took above. If we drop it to zero, run completion so
1694	 * that we don't return to the caller with completion still pending.
1695	 */
1696	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1697		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
1698			xfs_buf_ioend(bp);
1699		else
1700			xfs_buf_ioend_async(bp);
1701	}
1702
1703	if (wait)
1704		error = xfs_buf_iowait(bp);
1705
1706	/*
1707	 * Release the hold that keeps the buffer referenced for the entire
1708	 * I/O. Note that if the buffer is async, it is not safe to reference
1709	 * after this release.
1710	 */
1711	xfs_buf_rele(bp);
1712	return error;
1713}
1714
1715void *
1716xfs_buf_offset(
1717	struct xfs_buf		*bp,
1718	size_t			offset)
1719{
1720	struct page		*page;
1721
1722	if (bp->b_addr)
1723		return bp->b_addr + offset;
1724
1725	offset += bp->b_offset;
1726	page = bp->b_pages[offset >> PAGE_SHIFT];
1727	return page_address(page) + (offset & (PAGE_SIZE-1));
1728}
1729
1730void
1731xfs_buf_zero(
1732	struct xfs_buf		*bp,
1733	size_t			boff,
1734	size_t			bsize)
1735{
1736	size_t			bend;
1737
1738	bend = boff + bsize;
1739	while (boff < bend) {
1740		struct page	*page;
1741		int		page_index, page_offset, csize;
1742
1743		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1744		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1745		page = bp->b_pages[page_index];
1746		csize = min_t(size_t, PAGE_SIZE - page_offset,
1747				      BBTOB(bp->b_length) - boff);
1748
1749		ASSERT((csize + page_offset) <= PAGE_SIZE);
1750
1751		memset(page_address(page) + page_offset, 0, csize);
1752
1753		boff += csize;
1754	}
1755}
1756
1757/*
1758 * Log a message about and stale a buffer that a caller has decided is corrupt.
1759 *
1760 * This function should be called for the kinds of metadata corruption that
1761 * cannot be detect from a verifier, such as incorrect inter-block relationship
1762 * data.  Do /not/ call this function from a verifier function.
1763 *
1764 * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
1765 * be marked stale, but b_error will not be set.  The caller is responsible for
1766 * releasing the buffer or fixing it.
1767 */
1768void
1769__xfs_buf_mark_corrupt(
1770	struct xfs_buf		*bp,
1771	xfs_failaddr_t		fa)
1772{
1773	ASSERT(bp->b_flags & XBF_DONE);
1774
1775	xfs_buf_corruption_error(bp, fa);
1776	xfs_buf_stale(bp);
1777}
1778
1779/*
1780 *	Handling of buffer targets (buftargs).
1781 */
1782
1783/*
1784 * Wait for any bufs with callbacks that have been submitted but have not yet
1785 * returned. These buffers will have an elevated hold count, so wait on those
1786 * while freeing all the buffers only held by the LRU.
1787 */
1788static enum lru_status
1789xfs_buftarg_wait_rele(
1790	struct list_head	*item,
1791	struct list_lru_one	*lru,
1792	spinlock_t		*lru_lock,
1793	void			*arg)
1794
1795{
1796	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1797	struct list_head	*dispose = arg;
1798
1799	if (atomic_read(&bp->b_hold) > 1) {
1800		/* need to wait, so skip it this pass */
1801		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1802		return LRU_SKIP;
1803	}
1804	if (!spin_trylock(&bp->b_lock))
1805		return LRU_SKIP;
1806
1807	/*
1808	 * clear the LRU reference count so the buffer doesn't get
1809	 * ignored in xfs_buf_rele().
1810	 */
1811	atomic_set(&bp->b_lru_ref, 0);
1812	bp->b_state |= XFS_BSTATE_DISPOSE;
1813	list_lru_isolate_move(lru, item, dispose);
1814	spin_unlock(&bp->b_lock);
1815	return LRU_REMOVED;
1816}
1817
1818void
1819xfs_wait_buftarg(
1820	struct xfs_buftarg	*btp)
1821{
1822	LIST_HEAD(dispose);
1823	int			loop = 0;
1824	bool			write_fail = false;
1825
1826	/*
1827	 * First wait on the buftarg I/O count for all in-flight buffers to be
1828	 * released. This is critical as new buffers do not make the LRU until
1829	 * they are released.
1830	 *
1831	 * Next, flush the buffer workqueue to ensure all completion processing
1832	 * has finished. Just waiting on buffer locks is not sufficient for
1833	 * async IO as the reference count held over IO is not released until
1834	 * after the buffer lock is dropped. Hence we need to ensure here that
1835	 * all reference counts have been dropped before we start walking the
1836	 * LRU list.
1837	 */
1838	while (percpu_counter_sum(&btp->bt_io_count))
1839		delay(100);
1840	flush_workqueue(btp->bt_mount->m_buf_workqueue);
1841
1842	/* loop until there is nothing left on the lru list. */
1843	while (list_lru_count(&btp->bt_lru)) {
1844		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
1845			      &dispose, LONG_MAX);
1846
1847		while (!list_empty(&dispose)) {
1848			struct xfs_buf *bp;
1849			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1850			list_del_init(&bp->b_lru);
1851			if (bp->b_flags & XBF_WRITE_FAIL) {
1852				write_fail = true;
1853				xfs_buf_alert_ratelimited(bp,
1854					"XFS: Corruption Alert",
1855"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
1856					(long long)bp->b_bn);
1857			}
1858			xfs_buf_rele(bp);
1859		}
1860		if (loop++ != 0)
1861			delay(100);
1862	}
1863
1864	/*
1865	 * If one or more failed buffers were freed, that means dirty metadata
1866	 * was thrown away. This should only ever happen after I/O completion
1867	 * handling has elevated I/O error(s) to permanent failures and shuts
1868	 * down the fs.
1869	 */
1870	if (write_fail) {
1871		ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
1872		xfs_alert(btp->bt_mount,
1873	      "Please run xfs_repair to determine the extent of the problem.");
1874	}
1875}
1876
1877static enum lru_status
1878xfs_buftarg_isolate(
1879	struct list_head	*item,
1880	struct list_lru_one	*lru,
1881	spinlock_t		*lru_lock,
1882	void			*arg)
1883{
1884	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1885	struct list_head	*dispose = arg;
1886
1887	/*
1888	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1889	 * If we fail to get the lock, just skip it.
1890	 */
1891	if (!spin_trylock(&bp->b_lock))
1892		return LRU_SKIP;
1893	/*
1894	 * Decrement the b_lru_ref count unless the value is already
1895	 * zero. If the value is already zero, we need to reclaim the
1896	 * buffer, otherwise it gets another trip through the LRU.
1897	 */
1898	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1899		spin_unlock(&bp->b_lock);
1900		return LRU_ROTATE;
1901	}
1902
1903	bp->b_state |= XFS_BSTATE_DISPOSE;
1904	list_lru_isolate_move(lru, item, dispose);
1905	spin_unlock(&bp->b_lock);
1906	return LRU_REMOVED;
1907}
1908
1909static unsigned long
1910xfs_buftarg_shrink_scan(
1911	struct shrinker		*shrink,
1912	struct shrink_control	*sc)
1913{
1914	struct xfs_buftarg	*btp = container_of(shrink,
1915					struct xfs_buftarg, bt_shrinker);
1916	LIST_HEAD(dispose);
1917	unsigned long		freed;
1918
1919	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1920				     xfs_buftarg_isolate, &dispose);
1921
1922	while (!list_empty(&dispose)) {
1923		struct xfs_buf *bp;
1924		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1925		list_del_init(&bp->b_lru);
1926		xfs_buf_rele(bp);
1927	}
1928
1929	return freed;
1930}
1931
1932static unsigned long
1933xfs_buftarg_shrink_count(
1934	struct shrinker		*shrink,
1935	struct shrink_control	*sc)
1936{
1937	struct xfs_buftarg	*btp = container_of(shrink,
1938					struct xfs_buftarg, bt_shrinker);
1939	return list_lru_shrink_count(&btp->bt_lru, sc);
1940}
1941
1942void
1943xfs_free_buftarg(
1944	struct xfs_buftarg	*btp)
1945{
1946	unregister_shrinker(&btp->bt_shrinker);
1947	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1948	percpu_counter_destroy(&btp->bt_io_count);
1949	list_lru_destroy(&btp->bt_lru);
1950
1951	xfs_blkdev_issue_flush(btp);
1952
1953	kmem_free(btp);
1954}
1955
1956int
1957xfs_setsize_buftarg(
1958	xfs_buftarg_t		*btp,
1959	unsigned int		sectorsize)
1960{
1961	/* Set up metadata sector size info */
1962	btp->bt_meta_sectorsize = sectorsize;
1963	btp->bt_meta_sectormask = sectorsize - 1;
1964
1965	if (set_blocksize(btp->bt_bdev, sectorsize)) {
1966		xfs_warn(btp->bt_mount,
1967			"Cannot set_blocksize to %u on device %pg",
1968			sectorsize, btp->bt_bdev);
1969		return -EINVAL;
1970	}
1971
1972	/* Set up device logical sector size mask */
1973	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1974	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1975
1976	return 0;
1977}
1978
1979/*
1980 * When allocating the initial buffer target we have not yet
1981 * read in the superblock, so don't know what sized sectors
1982 * are being used at this early stage.  Play safe.
1983 */
1984STATIC int
1985xfs_setsize_buftarg_early(
1986	xfs_buftarg_t		*btp,
1987	struct block_device	*bdev)
1988{
1989	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
1990}
1991
1992xfs_buftarg_t *
1993xfs_alloc_buftarg(
1994	struct xfs_mount	*mp,
1995	struct block_device	*bdev,
1996	struct dax_device	*dax_dev)
1997{
1998	xfs_buftarg_t		*btp;
1999
2000	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
2001
2002	btp->bt_mount = mp;
2003	btp->bt_dev =  bdev->bd_dev;
2004	btp->bt_bdev = bdev;
2005	btp->bt_daxdev = dax_dev;
2006
2007	/*
2008	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
2009	 * per 30 seconds so as to not spam logs too much on repeated errors.
2010	 */
2011	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
2012			     DEFAULT_RATELIMIT_BURST);
2013
2014	if (xfs_setsize_buftarg_early(btp, bdev))
2015		goto error_free;
2016
2017	if (list_lru_init(&btp->bt_lru))
2018		goto error_free;
2019
2020	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
2021		goto error_lru;
2022
2023	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
2024	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
2025	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
2026	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
2027	if (register_shrinker(&btp->bt_shrinker))
2028		goto error_pcpu;
2029	return btp;
2030
2031error_pcpu:
2032	percpu_counter_destroy(&btp->bt_io_count);
2033error_lru:
2034	list_lru_destroy(&btp->bt_lru);
2035error_free:
2036	kmem_free(btp);
2037	return NULL;
2038}
2039
2040/*
2041 * Cancel a delayed write list.
2042 *
2043 * Remove each buffer from the list, clear the delwri queue flag and drop the
2044 * associated buffer reference.
2045 */
2046void
2047xfs_buf_delwri_cancel(
2048	struct list_head	*list)
2049{
2050	struct xfs_buf		*bp;
2051
2052	while (!list_empty(list)) {
2053		bp = list_first_entry(list, struct xfs_buf, b_list);
2054
2055		xfs_buf_lock(bp);
2056		bp->b_flags &= ~_XBF_DELWRI_Q;
2057		list_del_init(&bp->b_list);
2058		xfs_buf_relse(bp);
2059	}
2060}
2061
2062/*
2063 * Add a buffer to the delayed write list.
2064 *
2065 * This queues a buffer for writeout if it hasn't already been.  Note that
2066 * neither this routine nor the buffer list submission functions perform
2067 * any internal synchronization.  It is expected that the lists are thread-local
2068 * to the callers.
2069 *
2070 * Returns true if we queued up the buffer, or false if it already had
2071 * been on the buffer list.
2072 */
2073bool
2074xfs_buf_delwri_queue(
2075	struct xfs_buf		*bp,
2076	struct list_head	*list)
2077{
2078	ASSERT(xfs_buf_islocked(bp));
2079	ASSERT(!(bp->b_flags & XBF_READ));
2080
2081	/*
2082	 * If the buffer is already marked delwri it already is queued up
2083	 * by someone else for imediate writeout.  Just ignore it in that
2084	 * case.
2085	 */
2086	if (bp->b_flags & _XBF_DELWRI_Q) {
2087		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
2088		return false;
2089	}
2090
2091	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
2092
2093	/*
2094	 * If a buffer gets written out synchronously or marked stale while it
2095	 * is on a delwri list we lazily remove it. To do this, the other party
2096	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
2097	 * It remains referenced and on the list.  In a rare corner case it
2098	 * might get readded to a delwri list after the synchronous writeout, in
2099	 * which case we need just need to re-add the flag here.
2100	 */
2101	bp->b_flags |= _XBF_DELWRI_Q;
2102	if (list_empty(&bp->b_list)) {
2103		atomic_inc(&bp->b_hold);
2104		list_add_tail(&bp->b_list, list);
2105	}
2106
2107	return true;
2108}
2109
2110/*
2111 * Compare function is more complex than it needs to be because
2112 * the return value is only 32 bits and we are doing comparisons
2113 * on 64 bit values
2114 */
2115static int
2116xfs_buf_cmp(
2117	void			*priv,
2118	const struct list_head	*a,
2119	const struct list_head	*b)
2120{
2121	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
2122	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
2123	xfs_daddr_t		diff;
2124
2125	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
2126	if (diff < 0)
2127		return -1;
2128	if (diff > 0)
2129		return 1;
2130	return 0;
2131}
2132
2133/*
2134 * Submit buffers for write. If wait_list is specified, the buffers are
2135 * submitted using sync I/O and placed on the wait list such that the caller can
2136 * iowait each buffer. Otherwise async I/O is used and the buffers are released
2137 * at I/O completion time. In either case, buffers remain locked until I/O
2138 * completes and the buffer is released from the queue.
2139 */
2140static int
2141xfs_buf_delwri_submit_buffers(
2142	struct list_head	*buffer_list,
2143	struct list_head	*wait_list)
2144{
2145	struct xfs_buf		*bp, *n;
2146	int			pinned = 0;
2147	struct blk_plug		plug;
2148
2149	list_sort(NULL, buffer_list, xfs_buf_cmp);
2150
2151	blk_start_plug(&plug);
2152	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
2153		if (!wait_list) {
2154			if (xfs_buf_ispinned(bp)) {
2155				pinned++;
2156				continue;
2157			}
2158			if (!xfs_buf_trylock(bp))
2159				continue;
2160		} else {
2161			xfs_buf_lock(bp);
2162		}
2163
2164		/*
2165		 * Someone else might have written the buffer synchronously or
2166		 * marked it stale in the meantime.  In that case only the
2167		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
2168		 * reference and remove it from the list here.
2169		 */
2170		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2171			list_del_init(&bp->b_list);
2172			xfs_buf_relse(bp);
2173			continue;
2174		}
2175
2176		trace_xfs_buf_delwri_split(bp, _RET_IP_);
2177
2178		/*
2179		 * If we have a wait list, each buffer (and associated delwri
2180		 * queue reference) transfers to it and is submitted
2181		 * synchronously. Otherwise, drop the buffer from the delwri
2182		 * queue and submit async.
2183		 */
2184		bp->b_flags &= ~_XBF_DELWRI_Q;
2185		bp->b_flags |= XBF_WRITE;
2186		if (wait_list) {
2187			bp->b_flags &= ~XBF_ASYNC;
2188			list_move_tail(&bp->b_list, wait_list);
2189		} else {
2190			bp->b_flags |= XBF_ASYNC;
2191			list_del_init(&bp->b_list);
2192		}
2193		__xfs_buf_submit(bp, false);
2194	}
2195	blk_finish_plug(&plug);
2196
2197	return pinned;
2198}
2199
2200/*
2201 * Write out a buffer list asynchronously.
2202 *
2203 * This will take the @buffer_list, write all non-locked and non-pinned buffers
2204 * out and not wait for I/O completion on any of the buffers.  This interface
2205 * is only safely useable for callers that can track I/O completion by higher
2206 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
2207 * function.
2208 *
2209 * Note: this function will skip buffers it would block on, and in doing so
2210 * leaves them on @buffer_list so they can be retried on a later pass. As such,
2211 * it is up to the caller to ensure that the buffer list is fully submitted or
2212 * cancelled appropriately when they are finished with the list. Failure to
2213 * cancel or resubmit the list until it is empty will result in leaked buffers
2214 * at unmount time.
2215 */
2216int
2217xfs_buf_delwri_submit_nowait(
2218	struct list_head	*buffer_list)
2219{
2220	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
2221}
2222
2223/*
2224 * Write out a buffer list synchronously.
2225 *
2226 * This will take the @buffer_list, write all buffers out and wait for I/O
2227 * completion on all of the buffers. @buffer_list is consumed by the function,
2228 * so callers must have some other way of tracking buffers if they require such
2229 * functionality.
2230 */
2231int
2232xfs_buf_delwri_submit(
2233	struct list_head	*buffer_list)
2234{
2235	LIST_HEAD		(wait_list);
2236	int			error = 0, error2;
2237	struct xfs_buf		*bp;
2238
2239	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
2240
2241	/* Wait for IO to complete. */
2242	while (!list_empty(&wait_list)) {
2243		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2244
2245		list_del_init(&bp->b_list);
2246
2247		/*
2248		 * Wait on the locked buffer, check for errors and unlock and
2249		 * release the delwri queue reference.
2250		 */
2251		error2 = xfs_buf_iowait(bp);
2252		xfs_buf_relse(bp);
2253		if (!error)
2254			error = error2;
2255	}
2256
2257	return error;
2258}
2259
2260/*
2261 * Push a single buffer on a delwri queue.
2262 *
2263 * The purpose of this function is to submit a single buffer of a delwri queue
2264 * and return with the buffer still on the original queue. The waiting delwri
2265 * buffer submission infrastructure guarantees transfer of the delwri queue
2266 * buffer reference to a temporary wait list. We reuse this infrastructure to
2267 * transfer the buffer back to the original queue.
2268 *
2269 * Note the buffer transitions from the queued state, to the submitted and wait
2270 * listed state and back to the queued state during this call. The buffer
2271 * locking and queue management logic between _delwri_pushbuf() and
2272 * _delwri_queue() guarantee that the buffer cannot be queued to another list
2273 * before returning.
2274 */
2275int
2276xfs_buf_delwri_pushbuf(
2277	struct xfs_buf		*bp,
2278	struct list_head	*buffer_list)
2279{
2280	LIST_HEAD		(submit_list);
2281	int			error;
2282
2283	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
2284
2285	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
2286
2287	/*
2288	 * Isolate the buffer to a new local list so we can submit it for I/O
2289	 * independently from the rest of the original list.
2290	 */
2291	xfs_buf_lock(bp);
2292	list_move(&bp->b_list, &submit_list);
2293	xfs_buf_unlock(bp);
2294
2295	/*
2296	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
2297	 * the buffer on the wait list with the original reference. Rather than
2298	 * bounce the buffer from a local wait list back to the original list
2299	 * after I/O completion, reuse the original list as the wait list.
2300	 */
2301	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
2302
2303	/*
2304	 * The buffer is now locked, under I/O and wait listed on the original
2305	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
2306	 * return with the buffer unlocked and on the original queue.
2307	 */
2308	error = xfs_buf_iowait(bp);
2309	bp->b_flags |= _XBF_DELWRI_Q;
2310	xfs_buf_unlock(bp);
2311
2312	return error;
2313}
2314
2315int __init
2316xfs_buf_init(void)
2317{
2318	xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
2319					 SLAB_HWCACHE_ALIGN |
2320					 SLAB_RECLAIM_ACCOUNT |
2321					 SLAB_MEM_SPREAD,
2322					 NULL);
2323	if (!xfs_buf_zone)
2324		goto out;
2325
2326	return 0;
2327
2328 out:
2329	return -ENOMEM;
2330}
2331
2332void
2333xfs_buf_terminate(void)
2334{
2335	kmem_cache_destroy(xfs_buf_zone);
2336}
2337
2338void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
2339{
2340	/*
2341	 * Set the lru reference count to 0 based on the error injection tag.
2342	 * This allows userspace to disrupt buffer caching for debug/testing
2343	 * purposes.
2344	 */
2345	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
2346		lru_ref = 0;
2347
2348	atomic_set(&bp->b_lru_ref, lru_ref);
2349}
2350
2351/*
2352 * Verify an on-disk magic value against the magic value specified in the
2353 * verifier structure. The verifier magic is in disk byte order so the caller is
2354 * expected to pass the value directly from disk.
2355 */
2356bool
2357xfs_verify_magic(
2358	struct xfs_buf		*bp,
2359	__be32			dmagic)
2360{
2361	struct xfs_mount	*mp = bp->b_mount;
2362	int			idx;
2363
2364	idx = xfs_sb_version_hascrc(&mp->m_sb);
2365	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
2366		return false;
2367	return dmagic == bp->b_ops->magic[idx];
2368}
2369/*
2370 * Verify an on-disk magic value against the magic value specified in the
2371 * verifier structure. The verifier magic is in disk byte order so the caller is
2372 * expected to pass the value directly from disk.
2373 */
2374bool
2375xfs_verify_magic16(
2376	struct xfs_buf		*bp,
2377	__be16			dmagic)
2378{
2379	struct xfs_mount	*mp = bp->b_mount;
2380	int			idx;
2381
2382	idx = xfs_sb_version_hascrc(&mp->m_sb);
2383	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
2384		return false;
2385	return dmagic == bp->b_ops->magic16[idx];
2386}
2387