162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (c) 2000-2006 Silicon Graphics, Inc. 462306a36Sopenharmony_ci * All Rights Reserved. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci#include "xfs.h" 762306a36Sopenharmony_ci#include <linux/backing-dev.h> 862306a36Sopenharmony_ci#include <linux/dax.h> 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci#include "xfs_shared.h" 1162306a36Sopenharmony_ci#include "xfs_format.h" 1262306a36Sopenharmony_ci#include "xfs_log_format.h" 1362306a36Sopenharmony_ci#include "xfs_trans_resv.h" 1462306a36Sopenharmony_ci#include "xfs_mount.h" 1562306a36Sopenharmony_ci#include "xfs_trace.h" 1662306a36Sopenharmony_ci#include "xfs_log.h" 1762306a36Sopenharmony_ci#include "xfs_log_recover.h" 1862306a36Sopenharmony_ci#include "xfs_log_priv.h" 1962306a36Sopenharmony_ci#include "xfs_trans.h" 2062306a36Sopenharmony_ci#include "xfs_buf_item.h" 2162306a36Sopenharmony_ci#include "xfs_errortag.h" 2262306a36Sopenharmony_ci#include "xfs_error.h" 2362306a36Sopenharmony_ci#include "xfs_ag.h" 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_cistruct kmem_cache *xfs_buf_cache; 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci/* 2862306a36Sopenharmony_ci * Locking orders 2962306a36Sopenharmony_ci * 3062306a36Sopenharmony_ci * xfs_buf_ioacct_inc: 3162306a36Sopenharmony_ci * xfs_buf_ioacct_dec: 3262306a36Sopenharmony_ci * b_sema (caller holds) 3362306a36Sopenharmony_ci * b_lock 3462306a36Sopenharmony_ci * 3562306a36Sopenharmony_ci * xfs_buf_stale: 3662306a36Sopenharmony_ci * b_sema (caller holds) 3762306a36Sopenharmony_ci * b_lock 3862306a36Sopenharmony_ci * lru_lock 3962306a36Sopenharmony_ci * 4062306a36Sopenharmony_ci * xfs_buf_rele: 4162306a36Sopenharmony_ci * b_lock 4262306a36Sopenharmony_ci * pag_buf_lock 4362306a36Sopenharmony_ci * lru_lock 4462306a36Sopenharmony_ci * 4562306a36Sopenharmony_ci * xfs_buftarg_drain_rele 4662306a36Sopenharmony_ci * lru_lock 
4762306a36Sopenharmony_ci * b_lock (trylock due to inversion) 4862306a36Sopenharmony_ci * 4962306a36Sopenharmony_ci * xfs_buftarg_isolate 5062306a36Sopenharmony_ci * lru_lock 5162306a36Sopenharmony_ci * b_lock (trylock due to inversion) 5262306a36Sopenharmony_ci */ 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_cistatic int __xfs_buf_submit(struct xfs_buf *bp, bool wait); 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_cistatic inline int 5762306a36Sopenharmony_cixfs_buf_submit( 5862306a36Sopenharmony_ci struct xfs_buf *bp) 5962306a36Sopenharmony_ci{ 6062306a36Sopenharmony_ci return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); 6162306a36Sopenharmony_ci} 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_cistatic inline int 6462306a36Sopenharmony_cixfs_buf_is_vmapped( 6562306a36Sopenharmony_ci struct xfs_buf *bp) 6662306a36Sopenharmony_ci{ 6762306a36Sopenharmony_ci /* 6862306a36Sopenharmony_ci * Return true if the buffer is vmapped. 6962306a36Sopenharmony_ci * 7062306a36Sopenharmony_ci * b_addr is null if the buffer is not mapped, but the code is clever 7162306a36Sopenharmony_ci * enough to know it doesn't have to map a single page, so the check has 7262306a36Sopenharmony_ci * to be both for b_addr and bp->b_page_count > 1. 7362306a36Sopenharmony_ci */ 7462306a36Sopenharmony_ci return bp->b_addr && bp->b_page_count > 1; 7562306a36Sopenharmony_ci} 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_cistatic inline int 7862306a36Sopenharmony_cixfs_buf_vmap_len( 7962306a36Sopenharmony_ci struct xfs_buf *bp) 8062306a36Sopenharmony_ci{ 8162306a36Sopenharmony_ci return (bp->b_page_count * PAGE_SIZE); 8262306a36Sopenharmony_ci} 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci/* 8562306a36Sopenharmony_ci * Bump the I/O in flight count on the buftarg if we haven't yet done so for 8662306a36Sopenharmony_ci * this buffer. 
The count is incremented once per buffer (per hold cycle) 8762306a36Sopenharmony_ci * because the corresponding decrement is deferred to buffer release. Buffers 8862306a36Sopenharmony_ci * can undergo I/O multiple times in a hold-release cycle and per buffer I/O 8962306a36Sopenharmony_ci * tracking adds unnecessary overhead. This is used for sychronization purposes 9062306a36Sopenharmony_ci * with unmount (see xfs_buftarg_drain()), so all we really need is a count of 9162306a36Sopenharmony_ci * in-flight buffers. 9262306a36Sopenharmony_ci * 9362306a36Sopenharmony_ci * Buffers that are never released (e.g., superblock, iclog buffers) must set 9462306a36Sopenharmony_ci * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count 9562306a36Sopenharmony_ci * never reaches zero and unmount hangs indefinitely. 9662306a36Sopenharmony_ci */ 9762306a36Sopenharmony_cistatic inline void 9862306a36Sopenharmony_cixfs_buf_ioacct_inc( 9962306a36Sopenharmony_ci struct xfs_buf *bp) 10062306a36Sopenharmony_ci{ 10162306a36Sopenharmony_ci if (bp->b_flags & XBF_NO_IOACCT) 10262306a36Sopenharmony_ci return; 10362306a36Sopenharmony_ci 10462306a36Sopenharmony_ci ASSERT(bp->b_flags & XBF_ASYNC); 10562306a36Sopenharmony_ci spin_lock(&bp->b_lock); 10662306a36Sopenharmony_ci if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { 10762306a36Sopenharmony_ci bp->b_state |= XFS_BSTATE_IN_FLIGHT; 10862306a36Sopenharmony_ci percpu_counter_inc(&bp->b_target->bt_io_count); 10962306a36Sopenharmony_ci } 11062306a36Sopenharmony_ci spin_unlock(&bp->b_lock); 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ci/* 11462306a36Sopenharmony_ci * Clear the in-flight state on a buffer about to be released to the LRU or 11562306a36Sopenharmony_ci * freed and unaccount from the buftarg. 
11662306a36Sopenharmony_ci */ 11762306a36Sopenharmony_cistatic inline void 11862306a36Sopenharmony_ci__xfs_buf_ioacct_dec( 11962306a36Sopenharmony_ci struct xfs_buf *bp) 12062306a36Sopenharmony_ci{ 12162306a36Sopenharmony_ci lockdep_assert_held(&bp->b_lock); 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { 12462306a36Sopenharmony_ci bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; 12562306a36Sopenharmony_ci percpu_counter_dec(&bp->b_target->bt_io_count); 12662306a36Sopenharmony_ci } 12762306a36Sopenharmony_ci} 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_cistatic inline void 13062306a36Sopenharmony_cixfs_buf_ioacct_dec( 13162306a36Sopenharmony_ci struct xfs_buf *bp) 13262306a36Sopenharmony_ci{ 13362306a36Sopenharmony_ci spin_lock(&bp->b_lock); 13462306a36Sopenharmony_ci __xfs_buf_ioacct_dec(bp); 13562306a36Sopenharmony_ci spin_unlock(&bp->b_lock); 13662306a36Sopenharmony_ci} 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci/* 13962306a36Sopenharmony_ci * When we mark a buffer stale, we remove the buffer from the LRU and clear the 14062306a36Sopenharmony_ci * b_lru_ref count so that the buffer is freed immediately when the buffer 14162306a36Sopenharmony_ci * reference count falls to zero. If the buffer is already on the LRU, we need 14262306a36Sopenharmony_ci * to remove the reference that LRU holds on the buffer. 14362306a36Sopenharmony_ci * 14462306a36Sopenharmony_ci * This prevents build-up of stale buffers on the LRU. 
14562306a36Sopenharmony_ci */ 14662306a36Sopenharmony_civoid 14762306a36Sopenharmony_cixfs_buf_stale( 14862306a36Sopenharmony_ci struct xfs_buf *bp) 14962306a36Sopenharmony_ci{ 15062306a36Sopenharmony_ci ASSERT(xfs_buf_islocked(bp)); 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci bp->b_flags |= XBF_STALE; 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci /* 15562306a36Sopenharmony_ci * Clear the delwri status so that a delwri queue walker will not 15662306a36Sopenharmony_ci * flush this buffer to disk now that it is stale. The delwri queue has 15762306a36Sopenharmony_ci * a reference to the buffer, so this is safe to do. 15862306a36Sopenharmony_ci */ 15962306a36Sopenharmony_ci bp->b_flags &= ~_XBF_DELWRI_Q; 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci /* 16262306a36Sopenharmony_ci * Once the buffer is marked stale and unlocked, a subsequent lookup 16362306a36Sopenharmony_ci * could reset b_flags. There is no guarantee that the buffer is 16462306a36Sopenharmony_ci * unaccounted (released to LRU) before that occurs. Drop in-flight 16562306a36Sopenharmony_ci * status now to preserve accounting consistency. 
16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_ci spin_lock(&bp->b_lock); 16862306a36Sopenharmony_ci __xfs_buf_ioacct_dec(bp); 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci atomic_set(&bp->b_lru_ref, 0); 17162306a36Sopenharmony_ci if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 17262306a36Sopenharmony_ci (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) 17362306a36Sopenharmony_ci atomic_dec(&bp->b_hold); 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci ASSERT(atomic_read(&bp->b_hold) >= 1); 17662306a36Sopenharmony_ci spin_unlock(&bp->b_lock); 17762306a36Sopenharmony_ci} 17862306a36Sopenharmony_ci 17962306a36Sopenharmony_cistatic int 18062306a36Sopenharmony_cixfs_buf_get_maps( 18162306a36Sopenharmony_ci struct xfs_buf *bp, 18262306a36Sopenharmony_ci int map_count) 18362306a36Sopenharmony_ci{ 18462306a36Sopenharmony_ci ASSERT(bp->b_maps == NULL); 18562306a36Sopenharmony_ci bp->b_map_count = map_count; 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci if (map_count == 1) { 18862306a36Sopenharmony_ci bp->b_maps = &bp->__b_map; 18962306a36Sopenharmony_ci return 0; 19062306a36Sopenharmony_ci } 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), 19362306a36Sopenharmony_ci KM_NOFS); 19462306a36Sopenharmony_ci if (!bp->b_maps) 19562306a36Sopenharmony_ci return -ENOMEM; 19662306a36Sopenharmony_ci return 0; 19762306a36Sopenharmony_ci} 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci/* 20062306a36Sopenharmony_ci * Frees b_pages if it was allocated. 
20162306a36Sopenharmony_ci */ 20262306a36Sopenharmony_cistatic void 20362306a36Sopenharmony_cixfs_buf_free_maps( 20462306a36Sopenharmony_ci struct xfs_buf *bp) 20562306a36Sopenharmony_ci{ 20662306a36Sopenharmony_ci if (bp->b_maps != &bp->__b_map) { 20762306a36Sopenharmony_ci kmem_free(bp->b_maps); 20862306a36Sopenharmony_ci bp->b_maps = NULL; 20962306a36Sopenharmony_ci } 21062306a36Sopenharmony_ci} 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_cistatic int 21362306a36Sopenharmony_ci_xfs_buf_alloc( 21462306a36Sopenharmony_ci struct xfs_buftarg *target, 21562306a36Sopenharmony_ci struct xfs_buf_map *map, 21662306a36Sopenharmony_ci int nmaps, 21762306a36Sopenharmony_ci xfs_buf_flags_t flags, 21862306a36Sopenharmony_ci struct xfs_buf **bpp) 21962306a36Sopenharmony_ci{ 22062306a36Sopenharmony_ci struct xfs_buf *bp; 22162306a36Sopenharmony_ci int error; 22262306a36Sopenharmony_ci int i; 22362306a36Sopenharmony_ci 22462306a36Sopenharmony_ci *bpp = NULL; 22562306a36Sopenharmony_ci bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci /* 22862306a36Sopenharmony_ci * We don't want certain flags to appear in b_flags unless they are 22962306a36Sopenharmony_ci * specifically set by later operations on the buffer. 
23062306a36Sopenharmony_ci */ 23162306a36Sopenharmony_ci flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_ci atomic_set(&bp->b_hold, 1); 23462306a36Sopenharmony_ci atomic_set(&bp->b_lru_ref, 1); 23562306a36Sopenharmony_ci init_completion(&bp->b_iowait); 23662306a36Sopenharmony_ci INIT_LIST_HEAD(&bp->b_lru); 23762306a36Sopenharmony_ci INIT_LIST_HEAD(&bp->b_list); 23862306a36Sopenharmony_ci INIT_LIST_HEAD(&bp->b_li_list); 23962306a36Sopenharmony_ci sema_init(&bp->b_sema, 0); /* held, no waiters */ 24062306a36Sopenharmony_ci spin_lock_init(&bp->b_lock); 24162306a36Sopenharmony_ci bp->b_target = target; 24262306a36Sopenharmony_ci bp->b_mount = target->bt_mount; 24362306a36Sopenharmony_ci bp->b_flags = flags; 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci /* 24662306a36Sopenharmony_ci * Set length and io_length to the same value initially. 24762306a36Sopenharmony_ci * I/O routines should use io_length, which will be the same in 24862306a36Sopenharmony_ci * most cases but may be reset (e.g. XFS recovery). 
24962306a36Sopenharmony_ci */ 25062306a36Sopenharmony_ci error = xfs_buf_get_maps(bp, nmaps); 25162306a36Sopenharmony_ci if (error) { 25262306a36Sopenharmony_ci kmem_cache_free(xfs_buf_cache, bp); 25362306a36Sopenharmony_ci return error; 25462306a36Sopenharmony_ci } 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci bp->b_rhash_key = map[0].bm_bn; 25762306a36Sopenharmony_ci bp->b_length = 0; 25862306a36Sopenharmony_ci for (i = 0; i < nmaps; i++) { 25962306a36Sopenharmony_ci bp->b_maps[i].bm_bn = map[i].bm_bn; 26062306a36Sopenharmony_ci bp->b_maps[i].bm_len = map[i].bm_len; 26162306a36Sopenharmony_ci bp->b_length += map[i].bm_len; 26262306a36Sopenharmony_ci } 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci atomic_set(&bp->b_pin_count, 0); 26562306a36Sopenharmony_ci init_waitqueue_head(&bp->b_waiters); 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci XFS_STATS_INC(bp->b_mount, xb_create); 26862306a36Sopenharmony_ci trace_xfs_buf_init(bp, _RET_IP_); 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci *bpp = bp; 27162306a36Sopenharmony_ci return 0; 27262306a36Sopenharmony_ci} 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_cistatic void 27562306a36Sopenharmony_cixfs_buf_free_pages( 27662306a36Sopenharmony_ci struct xfs_buf *bp) 27762306a36Sopenharmony_ci{ 27862306a36Sopenharmony_ci uint i; 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci ASSERT(bp->b_flags & _XBF_PAGES); 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci if (xfs_buf_is_vmapped(bp)) 28362306a36Sopenharmony_ci vm_unmap_ram(bp->b_addr, bp->b_page_count); 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci for (i = 0; i < bp->b_page_count; i++) { 28662306a36Sopenharmony_ci if (bp->b_pages[i]) 28762306a36Sopenharmony_ci __free_page(bp->b_pages[i]); 28862306a36Sopenharmony_ci } 28962306a36Sopenharmony_ci mm_account_reclaimed_pages(bp->b_page_count); 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci if (bp->b_pages != bp->b_page_array) 29262306a36Sopenharmony_ci 
kmem_free(bp->b_pages); 29362306a36Sopenharmony_ci bp->b_pages = NULL; 29462306a36Sopenharmony_ci bp->b_flags &= ~_XBF_PAGES; 29562306a36Sopenharmony_ci} 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_cistatic void 29862306a36Sopenharmony_cixfs_buf_free_callback( 29962306a36Sopenharmony_ci struct callback_head *cb) 30062306a36Sopenharmony_ci{ 30162306a36Sopenharmony_ci struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci xfs_buf_free_maps(bp); 30462306a36Sopenharmony_ci kmem_cache_free(xfs_buf_cache, bp); 30562306a36Sopenharmony_ci} 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_cistatic void 30862306a36Sopenharmony_cixfs_buf_free( 30962306a36Sopenharmony_ci struct xfs_buf *bp) 31062306a36Sopenharmony_ci{ 31162306a36Sopenharmony_ci trace_xfs_buf_free(bp, _RET_IP_); 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci ASSERT(list_empty(&bp->b_lru)); 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci if (bp->b_flags & _XBF_PAGES) 31662306a36Sopenharmony_ci xfs_buf_free_pages(bp); 31762306a36Sopenharmony_ci else if (bp->b_flags & _XBF_KMEM) 31862306a36Sopenharmony_ci kmem_free(bp->b_addr); 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci call_rcu(&bp->b_rcu, xfs_buf_free_callback); 32162306a36Sopenharmony_ci} 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_cistatic int 32462306a36Sopenharmony_cixfs_buf_alloc_kmem( 32562306a36Sopenharmony_ci struct xfs_buf *bp, 32662306a36Sopenharmony_ci xfs_buf_flags_t flags) 32762306a36Sopenharmony_ci{ 32862306a36Sopenharmony_ci xfs_km_flags_t kmflag_mask = KM_NOFS; 32962306a36Sopenharmony_ci size_t size = BBTOB(bp->b_length); 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci /* Assure zeroed buffer for non-read cases. 
*/ 33262306a36Sopenharmony_ci if (!(flags & XBF_READ)) 33362306a36Sopenharmony_ci kmflag_mask |= KM_ZERO; 33462306a36Sopenharmony_ci 33562306a36Sopenharmony_ci bp->b_addr = kmem_alloc(size, kmflag_mask); 33662306a36Sopenharmony_ci if (!bp->b_addr) 33762306a36Sopenharmony_ci return -ENOMEM; 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 34062306a36Sopenharmony_ci ((unsigned long)bp->b_addr & PAGE_MASK)) { 34162306a36Sopenharmony_ci /* b_addr spans two pages - use alloc_page instead */ 34262306a36Sopenharmony_ci kmem_free(bp->b_addr); 34362306a36Sopenharmony_ci bp->b_addr = NULL; 34462306a36Sopenharmony_ci return -ENOMEM; 34562306a36Sopenharmony_ci } 34662306a36Sopenharmony_ci bp->b_offset = offset_in_page(bp->b_addr); 34762306a36Sopenharmony_ci bp->b_pages = bp->b_page_array; 34862306a36Sopenharmony_ci bp->b_pages[0] = kmem_to_page(bp->b_addr); 34962306a36Sopenharmony_ci bp->b_page_count = 1; 35062306a36Sopenharmony_ci bp->b_flags |= _XBF_KMEM; 35162306a36Sopenharmony_ci return 0; 35262306a36Sopenharmony_ci} 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_cistatic int 35562306a36Sopenharmony_cixfs_buf_alloc_pages( 35662306a36Sopenharmony_ci struct xfs_buf *bp, 35762306a36Sopenharmony_ci xfs_buf_flags_t flags) 35862306a36Sopenharmony_ci{ 35962306a36Sopenharmony_ci gfp_t gfp_mask = __GFP_NOWARN; 36062306a36Sopenharmony_ci long filled = 0; 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_ci if (flags & XBF_READ_AHEAD) 36362306a36Sopenharmony_ci gfp_mask |= __GFP_NORETRY; 36462306a36Sopenharmony_ci else 36562306a36Sopenharmony_ci gfp_mask |= GFP_NOFS; 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_ci /* Make sure that we have a page list */ 36862306a36Sopenharmony_ci bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); 36962306a36Sopenharmony_ci if (bp->b_page_count <= XB_PAGES) { 37062306a36Sopenharmony_ci bp->b_pages = bp->b_page_array; 37162306a36Sopenharmony_ci } else { 
37262306a36Sopenharmony_ci bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, 37362306a36Sopenharmony_ci gfp_mask); 37462306a36Sopenharmony_ci if (!bp->b_pages) 37562306a36Sopenharmony_ci return -ENOMEM; 37662306a36Sopenharmony_ci } 37762306a36Sopenharmony_ci bp->b_flags |= _XBF_PAGES; 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_ci /* Assure zeroed buffer for non-read cases. */ 38062306a36Sopenharmony_ci if (!(flags & XBF_READ)) 38162306a36Sopenharmony_ci gfp_mask |= __GFP_ZERO; 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci /* 38462306a36Sopenharmony_ci * Bulk filling of pages can take multiple calls. Not filling the entire 38562306a36Sopenharmony_ci * array is not an allocation failure, so don't back off if we get at 38662306a36Sopenharmony_ci * least one extra page. 38762306a36Sopenharmony_ci */ 38862306a36Sopenharmony_ci for (;;) { 38962306a36Sopenharmony_ci long last = filled; 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, 39262306a36Sopenharmony_ci bp->b_pages); 39362306a36Sopenharmony_ci if (filled == bp->b_page_count) { 39462306a36Sopenharmony_ci XFS_STATS_INC(bp->b_mount, xb_page_found); 39562306a36Sopenharmony_ci break; 39662306a36Sopenharmony_ci } 39762306a36Sopenharmony_ci 39862306a36Sopenharmony_ci if (filled != last) 39962306a36Sopenharmony_ci continue; 40062306a36Sopenharmony_ci 40162306a36Sopenharmony_ci if (flags & XBF_READ_AHEAD) { 40262306a36Sopenharmony_ci xfs_buf_free_pages(bp); 40362306a36Sopenharmony_ci return -ENOMEM; 40462306a36Sopenharmony_ci } 40562306a36Sopenharmony_ci 40662306a36Sopenharmony_ci XFS_STATS_INC(bp->b_mount, xb_page_retries); 40762306a36Sopenharmony_ci memalloc_retry_wait(gfp_mask); 40862306a36Sopenharmony_ci } 40962306a36Sopenharmony_ci return 0; 41062306a36Sopenharmony_ci} 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_ci/* 41362306a36Sopenharmony_ci * Map buffer into kernel address-space if necessary. 
41462306a36Sopenharmony_ci */ 41562306a36Sopenharmony_ciSTATIC int 41662306a36Sopenharmony_ci_xfs_buf_map_pages( 41762306a36Sopenharmony_ci struct xfs_buf *bp, 41862306a36Sopenharmony_ci xfs_buf_flags_t flags) 41962306a36Sopenharmony_ci{ 42062306a36Sopenharmony_ci ASSERT(bp->b_flags & _XBF_PAGES); 42162306a36Sopenharmony_ci if (bp->b_page_count == 1) { 42262306a36Sopenharmony_ci /* A single page buffer is always mappable */ 42362306a36Sopenharmony_ci bp->b_addr = page_address(bp->b_pages[0]); 42462306a36Sopenharmony_ci } else if (flags & XBF_UNMAPPED) { 42562306a36Sopenharmony_ci bp->b_addr = NULL; 42662306a36Sopenharmony_ci } else { 42762306a36Sopenharmony_ci int retried = 0; 42862306a36Sopenharmony_ci unsigned nofs_flag; 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_ci /* 43162306a36Sopenharmony_ci * vm_map_ram() will allocate auxiliary structures (e.g. 43262306a36Sopenharmony_ci * pagetables) with GFP_KERNEL, yet we are likely to be under 43362306a36Sopenharmony_ci * GFP_NOFS context here. Hence we need to tell memory reclaim 43462306a36Sopenharmony_ci * that we are in such a context via PF_MEMALLOC_NOFS to prevent 43562306a36Sopenharmony_ci * memory reclaim re-entering the filesystem here and 43662306a36Sopenharmony_ci * potentially deadlocking. 
43762306a36Sopenharmony_ci */ 43862306a36Sopenharmony_ci nofs_flag = memalloc_nofs_save(); 43962306a36Sopenharmony_ci do { 44062306a36Sopenharmony_ci bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 44162306a36Sopenharmony_ci -1); 44262306a36Sopenharmony_ci if (bp->b_addr) 44362306a36Sopenharmony_ci break; 44462306a36Sopenharmony_ci vm_unmap_aliases(); 44562306a36Sopenharmony_ci } while (retried++ <= 1); 44662306a36Sopenharmony_ci memalloc_nofs_restore(nofs_flag); 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci if (!bp->b_addr) 44962306a36Sopenharmony_ci return -ENOMEM; 45062306a36Sopenharmony_ci } 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci return 0; 45362306a36Sopenharmony_ci} 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci/* 45662306a36Sopenharmony_ci * Finding and Reading Buffers 45762306a36Sopenharmony_ci */ 45862306a36Sopenharmony_cistatic int 45962306a36Sopenharmony_ci_xfs_buf_obj_cmp( 46062306a36Sopenharmony_ci struct rhashtable_compare_arg *arg, 46162306a36Sopenharmony_ci const void *obj) 46262306a36Sopenharmony_ci{ 46362306a36Sopenharmony_ci const struct xfs_buf_map *map = arg->key; 46462306a36Sopenharmony_ci const struct xfs_buf *bp = obj; 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci /* 46762306a36Sopenharmony_ci * The key hashing in the lookup path depends on the key being the 46862306a36Sopenharmony_ci * first element of the compare_arg, make sure to assert this. 46962306a36Sopenharmony_ci */ 47062306a36Sopenharmony_ci BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci if (bp->b_rhash_key != map->bm_bn) 47362306a36Sopenharmony_ci return 1; 47462306a36Sopenharmony_ci 47562306a36Sopenharmony_ci if (unlikely(bp->b_length != map->bm_len)) { 47662306a36Sopenharmony_ci /* 47762306a36Sopenharmony_ci * found a block number match. 
If the range doesn't 47862306a36Sopenharmony_ci * match, the only way this is allowed is if the buffer 47962306a36Sopenharmony_ci * in the cache is stale and the transaction that made 48062306a36Sopenharmony_ci * it stale has not yet committed. i.e. we are 48162306a36Sopenharmony_ci * reallocating a busy extent. Skip this buffer and 48262306a36Sopenharmony_ci * continue searching for an exact match. 48362306a36Sopenharmony_ci */ 48462306a36Sopenharmony_ci if (!(map->bm_flags & XBM_LIVESCAN)) 48562306a36Sopenharmony_ci ASSERT(bp->b_flags & XBF_STALE); 48662306a36Sopenharmony_ci return 1; 48762306a36Sopenharmony_ci } 48862306a36Sopenharmony_ci return 0; 48962306a36Sopenharmony_ci} 49062306a36Sopenharmony_ci 49162306a36Sopenharmony_cistatic const struct rhashtable_params xfs_buf_hash_params = { 49262306a36Sopenharmony_ci .min_size = 32, /* empty AGs have minimal footprint */ 49362306a36Sopenharmony_ci .nelem_hint = 16, 49462306a36Sopenharmony_ci .key_len = sizeof(xfs_daddr_t), 49562306a36Sopenharmony_ci .key_offset = offsetof(struct xfs_buf, b_rhash_key), 49662306a36Sopenharmony_ci .head_offset = offsetof(struct xfs_buf, b_rhash_head), 49762306a36Sopenharmony_ci .automatic_shrinking = true, 49862306a36Sopenharmony_ci .obj_cmpfn = _xfs_buf_obj_cmp, 49962306a36Sopenharmony_ci}; 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_ciint 50262306a36Sopenharmony_cixfs_buf_hash_init( 50362306a36Sopenharmony_ci struct xfs_perag *pag) 50462306a36Sopenharmony_ci{ 50562306a36Sopenharmony_ci spin_lock_init(&pag->pag_buf_lock); 50662306a36Sopenharmony_ci return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); 50762306a36Sopenharmony_ci} 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_civoid 51062306a36Sopenharmony_cixfs_buf_hash_destroy( 51162306a36Sopenharmony_ci struct xfs_perag *pag) 51262306a36Sopenharmony_ci{ 51362306a36Sopenharmony_ci rhashtable_destroy(&pag->pag_buf_hash); 51462306a36Sopenharmony_ci} 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_cistatic 
int 51762306a36Sopenharmony_cixfs_buf_map_verify( 51862306a36Sopenharmony_ci struct xfs_buftarg *btp, 51962306a36Sopenharmony_ci struct xfs_buf_map *map) 52062306a36Sopenharmony_ci{ 52162306a36Sopenharmony_ci xfs_daddr_t eofs; 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_ci /* Check for IOs smaller than the sector size / not sector aligned */ 52462306a36Sopenharmony_ci ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 52562306a36Sopenharmony_ci ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_ci /* 52862306a36Sopenharmony_ci * Corrupted block numbers can get through to here, unfortunately, so we 52962306a36Sopenharmony_ci * have to check that the buffer falls within the filesystem bounds. 53062306a36Sopenharmony_ci */ 53162306a36Sopenharmony_ci eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 53262306a36Sopenharmony_ci if (map->bm_bn < 0 || map->bm_bn >= eofs) { 53362306a36Sopenharmony_ci xfs_alert(btp->bt_mount, 53462306a36Sopenharmony_ci "%s: daddr 0x%llx out of range, EOFS 0x%llx", 53562306a36Sopenharmony_ci __func__, map->bm_bn, eofs); 53662306a36Sopenharmony_ci WARN_ON(1); 53762306a36Sopenharmony_ci return -EFSCORRUPTED; 53862306a36Sopenharmony_ci } 53962306a36Sopenharmony_ci return 0; 54062306a36Sopenharmony_ci} 54162306a36Sopenharmony_ci 54262306a36Sopenharmony_cistatic int 54362306a36Sopenharmony_cixfs_buf_find_lock( 54462306a36Sopenharmony_ci struct xfs_buf *bp, 54562306a36Sopenharmony_ci xfs_buf_flags_t flags) 54662306a36Sopenharmony_ci{ 54762306a36Sopenharmony_ci if (flags & XBF_TRYLOCK) { 54862306a36Sopenharmony_ci if (!xfs_buf_trylock(bp)) { 54962306a36Sopenharmony_ci XFS_STATS_INC(bp->b_mount, xb_busy_locked); 55062306a36Sopenharmony_ci return -EAGAIN; 55162306a36Sopenharmony_ci } 55262306a36Sopenharmony_ci } else { 55362306a36Sopenharmony_ci xfs_buf_lock(bp); 55462306a36Sopenharmony_ci XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 
55562306a36Sopenharmony_ci } 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_ci /* 55862306a36Sopenharmony_ci * if the buffer is stale, clear all the external state associated with 55962306a36Sopenharmony_ci * it. We need to keep flags such as how we allocated the buffer memory 56062306a36Sopenharmony_ci * intact here. 56162306a36Sopenharmony_ci */ 56262306a36Sopenharmony_ci if (bp->b_flags & XBF_STALE) { 56362306a36Sopenharmony_ci if (flags & XBF_LIVESCAN) { 56462306a36Sopenharmony_ci xfs_buf_unlock(bp); 56562306a36Sopenharmony_ci return -ENOENT; 56662306a36Sopenharmony_ci } 56762306a36Sopenharmony_ci ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 56862306a36Sopenharmony_ci bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 56962306a36Sopenharmony_ci bp->b_ops = NULL; 57062306a36Sopenharmony_ci } 57162306a36Sopenharmony_ci return 0; 57262306a36Sopenharmony_ci} 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_cistatic inline int 57562306a36Sopenharmony_cixfs_buf_lookup( 57662306a36Sopenharmony_ci struct xfs_perag *pag, 57762306a36Sopenharmony_ci struct xfs_buf_map *map, 57862306a36Sopenharmony_ci xfs_buf_flags_t flags, 57962306a36Sopenharmony_ci struct xfs_buf **bpp) 58062306a36Sopenharmony_ci{ 58162306a36Sopenharmony_ci struct xfs_buf *bp; 58262306a36Sopenharmony_ci int error; 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci rcu_read_lock(); 58562306a36Sopenharmony_ci bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); 58662306a36Sopenharmony_ci if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { 58762306a36Sopenharmony_ci rcu_read_unlock(); 58862306a36Sopenharmony_ci return -ENOENT; 58962306a36Sopenharmony_ci } 59062306a36Sopenharmony_ci rcu_read_unlock(); 59162306a36Sopenharmony_ci 59262306a36Sopenharmony_ci error = xfs_buf_find_lock(bp, flags); 59362306a36Sopenharmony_ci if (error) { 59462306a36Sopenharmony_ci xfs_buf_rele(bp); 59562306a36Sopenharmony_ci return error; 59662306a36Sopenharmony_ci } 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci 
	trace_xfs_buf_find(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;
}

/*
 * Insert the new_bp into the hash table. This consumes the perag reference
 * taken for the lookup regardless of the result of the insert.
 */
static int
xfs_buf_find_insert(
	struct xfs_buftarg	*btp,
	struct xfs_perag	*pag,
	struct xfs_buf_map	*cmap,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*new_bp;
	struct xfs_buf		*bp;
	int			error;

	error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
	if (error)
		goto out_drop_pag;

	/*
	 * For buffers that fit entirely within a single page, first attempt to
	 * allocate the memory from the heap to minimise memory usage. If we
	 * can't get heap memory for these small buffers, we fall back to using
	 * the page allocator.
	 */
	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
		error = xfs_buf_alloc_pages(new_bp, flags);
		if (error)
			goto out_free_buf;
	}

	/*
	 * Publish the buffer under pag_buf_lock. The rhashtable insert may
	 * return an already-hashed buffer if a concurrent insert beat us.
	 */
	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
			&new_bp->b_rhash_head, xfs_buf_hash_params);
	if (IS_ERR(bp)) {
		error = PTR_ERR(bp);
		spin_unlock(&pag->pag_buf_lock);
		goto out_free_buf;
	}
	if (bp) {
		/* found an existing buffer */
		atomic_inc(&bp->b_hold);
		spin_unlock(&pag->pag_buf_lock);
		error = xfs_buf_find_lock(bp, flags);
		if (error)
			xfs_buf_rele(bp);
		else
			*bpp = bp;
		/* our new buffer was never published, so free it */
		goto out_free_buf;
	}

	/* The new buffer keeps the perag reference until it is freed. */
	new_bp->b_pag = pag;
	spin_unlock(&pag->pag_buf_lock);
	*bpp = new_bp;
	return 0;

out_free_buf:
	xfs_buf_free(new_bp);
out_drop_pag:
	xfs_perag_put(pag);
	return error;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
int
xfs_buf_get_map(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_perag	*pag;
	struct xfs_buf		*bp = NULL;
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	int			error;
	int			i;

	if (flags & XBF_LIVESCAN)
		cmap.bm_flags |= XBM_LIVESCAN;
	/* the compound map spans the entire multi-segment range */
	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	error = xfs_buf_map_verify(btp, &cmap);
	if (error)
		return error;

	pag = xfs_perag_get(btp->bt_mount,
			xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	error = xfs_buf_lookup(pag, &cmap, flags, &bp);
	if (error && error != -ENOENT)
		goto out_put_perag;

	/* cache hits always outnumber misses by at least 10:1 */
	if (unlikely(!bp)) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);

		if (flags & XBF_INCORE)
			goto out_put_perag;

		/* xfs_buf_find_insert() consumes the perag reference. */
		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
				flags, &bp);
		if (error)
			return error;
	} else {
		XFS_STATS_INC(btp->bt_mount, xb_get_locked);
		xfs_perag_put(pag);
	}

	/* We do not hold a perag reference anymore. */
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn_ratelimited(btp->bt_mount,
				"%s: failed to map %u pages", __func__,
				bp->b_page_count);
			xfs_buf_relse(bp);
			return error;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(btp->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;

out_put_perag:
	xfs_perag_put(pag);
	return error;
}

/*
 * Set the buffer up for a read and submit the I/O. The caller must not have
 * requested a write, and the buffer must have a valid disk address.
 */
int
_xfs_buf_read(
	struct xfs_buf		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	return xfs_buf_submit(bp);
}

/*
 * Reverify a buffer found in cache without an attached ->b_ops.
 *
 * If the caller passed an ops structure and the buffer doesn't have ops
 * assigned, set the ops and use it to verify the contents. If verification
 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
 * already in XBF_DONE state on entry.
 *
 * Under normal operations, every in-core buffer is verified on read I/O
 * completion. There are two scenarios that can lead to in-core buffers without
 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
 * filesystem, though these buffers are purged at the end of recovery. The
 * other is online repair, which intentionally reads with a NULL buffer ops to
 * run several verifiers across an in-core buffer in order to establish buffer
 * type. If repair can't establish that, the buffer will be left in memory
 * with NULL buffer ops.
 */
int
xfs_buf_reverify(
	struct xfs_buf		*bp,
	const struct xfs_buf_ops *ops)
{
	ASSERT(bp->b_flags & XBF_DONE);
	ASSERT(bp->b_error == 0);

	if (!ops || bp->b_ops)
		return 0;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	/* a verifier failure means the contents cannot be trusted */
	if (bp->b_error)
		bp->b_flags &= ~XBF_DONE;
	return bp->b_error;
}

/*
 * Read a buffer and verify its contents with the supplied ops. @fa is the
 * caller's fault address, reported if the read fails.
 */
int
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops,
	xfs_failaddr_t		fa)
{
	struct xfs_buf		*bp;
	int			error;

	flags |= XBF_READ;
	*bpp = NULL;

	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
	if (error)
		return error;

	trace_xfs_buf_read(bp, flags, _RET_IP_);

	if (!(bp->b_flags & XBF_DONE)) {
		/* Initiate the buffer read and wait. */
		XFS_STATS_INC(target->bt_mount, xb_get_read);
		bp->b_ops = ops;
		error = _xfs_buf_read(bp, flags);

		/* Readahead iodone already dropped the buffer, so exit. */
		if (flags & XBF_ASYNC)
			return 0;
	} else {
		/* Buffer already read; all we need to do is check it. */
		error = xfs_buf_reverify(bp, ops);

		/* Readahead already finished; drop the buffer and exit. */
		if (flags & XBF_ASYNC) {
			xfs_buf_relse(bp);
			return 0;
		}

		/* We do not want read in the flags */
		bp->b_flags &= ~XBF_READ;
		ASSERT(bp->b_ops != NULL || ops == NULL);
	}

	/*
	 * If we've had a read error, then the contents of the buffer are
	 * invalid and should not be used. To ensure that a followup read tries
	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
	 * mark the buffer stale. This ensures that anyone who has a current
	 * reference to the buffer will interpret its contents correctly and
	 * future cache lookups will also treat it as an empty, uninitialised
	 * buffer.
	 */
	if (error) {
		/*
		 * Check against log shutdown for error reporting because
		 * metadata writeback may require a read first and we need to
		 * report errors in metadata writeback until the log is shut
		 * down. High level transaction read functions already check
		 * against mount shutdown, anyway, so we only need to be
		 * concerned about low level IO interactions here.
		 */
		if (!xlog_is_shutdown(target->bt_mount->m_log))
			xfs_buf_ioerror_alert(bp, fa);

		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_relse(bp);

		/* bad CRC means corrupted metadata */
		if (error == -EFSBADCRC)
			error = -EFSCORRUPTED;
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	/*
	 * Best effort: the trylock/async read either starts the I/O or does
	 * nothing; errors are ignored and completion drops the buffer.
	 */
	xfs_buf_read_map(target, map, nmaps,
		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
		     __this_address);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing. Uncached buffers always have
 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
 * is cached or uncached during fault diagnosis.
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	*bpp = NULL;

	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
	if (error)
		return error;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_rhash_key = XFS_BUF_DADDR_NULL;	/* marks the buffer uncached */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit(bp);
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 * Allocate an uncached buffer of @numblks blocks with all pages mapped into
 * a contiguous address range. No disk address is assigned (the map uses
 * XFS_BUF_DADDR_NULL); the caller sets up and submits any I/O itself.
 */
int
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	int			error;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	*bpp = NULL;

	/* flags might contain irrelevant bits, pass only what we care about */
	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
	if (error)
		return error;

	error = xfs_buf_alloc_pages(bp, flags);
	if (error)
		goto fail_free_buf;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_buf;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	*bpp = bp;
	return 0;

fail_free_buf:
	xfs_buf_free(bp);
	return error;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	struct xfs_buf	*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool		release;
	bool		freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		/* a buffer with no perag is not hashed and must not be on the LRU */
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
		 * ensures the decrement occurs only once per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* the actual free is deferred until all locks have been dropped */
	if (freebuf)
		xfs_buf_free(bp);
}


/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated.
 * Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_mount, 0);
	down(&bp->b_sema);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

/* Unlock a previously locked buffer. */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	up(&bp->b_sema);
	trace_xfs_buf_unlock(bp, _RET_IP_);
}

/*
 * Wait for the buffer's pin count to drop to zero, sleeping uninterruptibly
 * on b_waiters. The unpin side is expected to issue the wakeup.
 */
STATIC void
xfs_buf_wait_unpin(
	struct xfs_buf		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	/* fast path: nothing to wait for */
	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		/* re-check under TASK_UNINTERRUPTIBLE to avoid a lost wakeup */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Emit an error alert for this buffer, rate limited to one alert per target
 * per five seconds.
 */
static void
xfs_buf_ioerror_alert_ratelimited(
	struct xfs_buf		*bp)
{
	static unsigned long	lasttime;
	static struct xfs_buftarg *lasttarg;

	if (bp->b_target != lasttarg ||
	    time_after(jiffies, (lasttime + 5*HZ))) {
		lasttime = jiffies;
		xfs_buf_ioerror_alert(bp, __this_address);
	}
	lasttarg = bp->b_target;
}

/*
 * Account for this latest trip around the retry handler, and decide if
 * we've failed enough times to constitute a permanent failure.
 */
static bool
xfs_buf_ioerror_permanent(
	struct xfs_buf		*bp,
	struct xfs_error_cfg	*cfg)
{
	struct xfs_mount	*mp = bp->b_mount;

	/* exceeded the configured retry count? */
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    ++bp->b_retries > cfg->max_retries)
		return true;
	/* exceeded the configured retry time window? */
	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
		return true;

	/* At unmount we may treat errors differently */
	if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
		return true;

	return false;
}

/*
 * On a sync write or shutdown we just want to stale the buffer and let the
 * caller handle the error in bp->b_error appropriately.
 *
 * If the write was asynchronous then no one will be looking for the error. If
 * this is the first failure of this type, clear the error state and write the
 * buffer out again. This means we always retry an async write failure at least
 * once, but we also need to set the buffer up to behave correctly now for
 * repeated failures.
 *
 * If we get repeated async write failures, then we take action according to the
 * error configuration we have been set up to use.
 *
 * Returns true if this function took care of error handling and the caller must
 * not touch the buffer again. Return false if the caller should proceed with
 * normal I/O completion handling.
 */
static bool
xfs_buf_ioend_handle_error(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_error_cfg	*cfg;

	/*
	 * If we've already shutdown the journal because of I/O errors, there's
	 * no point in giving this a retry.
	 */
	if (xlog_is_shutdown(mp->m_log))
		goto out_stale;

	xfs_buf_ioerror_alert_ratelimited(bp);

	/*
	 * We're not going to bother about retrying this during recovery.
	 * One strike!
	 */
	if (bp->b_flags & _XBF_LOGRECOVERY) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		return false;
	}

	/*
	 * Synchronous writes will have callers process the error.
	 */
	if (!(bp->b_flags & XBF_ASYNC))
		goto out_stale;

	trace_xfs_buf_iodone_async(bp, _RET_IP_);

	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
	/* first failure of this kind: reset state and retry the write once */
	if (bp->b_last_error != bp->b_error ||
	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
		bp->b_last_error = bp->b_error;
		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
		    !bp->b_first_retry_time)
			bp->b_first_retry_time = jiffies;
		goto resubmit;
	}

	/*
	 * Permanent error - we need to trigger a shutdown if we haven't already
	 * to indicate that inconsistency will result from this action.
	 */
	if (xfs_buf_ioerror_permanent(bp, cfg)) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out_stale;
	}

	/* Still considered a transient error. Caller will schedule retries. */
	if (bp->b_flags & _XBF_INODES)
		xfs_buf_inode_io_fail(bp);
	else if (bp->b_flags & _XBF_DQUOTS)
		xfs_buf_dquot_io_fail(bp);
	else
		ASSERT(list_empty(&bp->b_li_list));
	xfs_buf_ioerror(bp, 0);
	xfs_buf_relse(bp);
	return true;

resubmit:
	xfs_buf_ioerror(bp, 0);
	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
	xfs_buf_submit(bp);
	return true;
out_stale:
	xfs_buf_stale(bp);
	bp->b_flags |= XBF_DONE;
	bp->b_flags &= ~XBF_WRITE;
	trace_xfs_buf_error_relse(bp, _RET_IP_);
	return false;
}

/*
 * Buffer I/O completion: fold in any transport error, run the read verifier
 * or write completion processing, then release the buffer (async) or wake
 * the waiter (sync).
 */
static void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	if (bp->b_flags & XBF_READ) {
		if (!bp->b_error && bp->b_ops)
			bp->b_ops->verify_read(bp);
		if (!bp->b_error)
			bp->b_flags |= XBF_DONE;
	} else {
		if (!bp->b_error) {
			bp->b_flags &= ~XBF_WRITE_FAIL;
			bp->b_flags |= XBF_DONE;
		}

		/* error handling may retry/release the buffer; if so, stop here */
		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
			return;

		/* clear the retry state */
		bp->b_last_error = 0;
		bp->b_retries = 0;
		bp->b_first_retry_time = 0;

		/*
		 * Note that for things like remote attribute buffers, there may
		 * not be a buffer log item here, so processing the buffer log
		 * item must remain optional.
		 */
		if (bp->b_log_item)
			xfs_buf_item_done(bp);

		if (bp->b_flags & _XBF_INODES)
			xfs_buf_inode_iodone(bp);
		else if (bp->b_flags & _XBF_DQUOTS)
			xfs_buf_dquot_iodone(bp);

	}

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
			 _XBF_LOGRECOVERY);

	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

/* Workqueue callback that runs deferred buffer I/O completion. */
static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, struct xfs_buf, b_ioend_work);

	xfs_buf_ioend(bp);
}

/* Defer buffer I/O completion to the mount's buffer workqueue. */
static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
}

void
__xfs_buf_ioerror(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error
 >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

/* Emit a rate-limited alert describing a metadata I/O error on @bp. */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	xfs_failaddr_t		func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
		  func, (uint64_t)xfs_buf_daddr(bp),
		  bp->b_length, -bp->b_error);
}

/*
 * To simulate an I/O failure, the buffer must be locked and held with at least
 * three references. The LRU reference is dropped by the stale call. The buf
 * item reference is dropped via ioend processing. The third reference is owned
 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
 */
void
xfs_buf_ioend_fail(
	struct xfs_buf	*bp)
{
	bp->b_flags &= ~XBF_DONE;
	xfs_buf_stale(bp);
	xfs_buf_ioerror(bp, -EIO);
	xfs_buf_ioend(bp);
}

/*
 * Synchronous buffer write.  The caller must hold the buffer locked.  A
 * submission failure forces a filesystem shutdown since the metadata write
 * was lost.  Returns 0 or a negative errno.
 */
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_DONE);

	error = xfs_buf_submit(bp);
	if (error)
		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
	return error;
}

/*
 * Per-bio completion handler.  A buffer may be backed by multiple bios, so
 * errors are accumulated in b_io_error and full completion only runs once the
 * last in-flight bio for the buffer finishes.
 */
static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	/* error-injection hook for async metadata writes */
	if (!bio->bi_status &&
	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
		bio->bi_status = BLK_STS_IOERR;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
	if (bio->bi_status) {
		int error = blk_status_to_errno(bio->bi_status);

		/* only records the first error; later bios leave it alone */
		cmpxchg(&bp->b_io_error, 0, error);
	}

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	/* last bio down: defer ioend processing to the buffer workqueue */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
	bio_put(bio);
}

/*
 * Build and submit bio(s) for one map vector of the buffer.  @buf_offset and
 * @count are cursors shared across the per-map calls from _xfs_buf_ioapply():
 * they are advanced here by the amount of I/O issued for this vector.
 */
static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	blk_opf_t	op)
{
	int		page_index;
	unsigned int	total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	/* one in-flight count per bio; paired with dec in bio completion */
	atomic_inc(&bp->b_io_remaining);
	nr_pages = bio_max_segs(total_nr_pages);

	bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		/* bio full: submit what we have and start another one */
		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}

/*
 * Map the buffer's I/O request onto the block layer: pick the request op from
 * the buffer flags, run the write verifier for writes, then issue bios for
 * every map vector under a block plug.  Errors are left in bp->b_error.
 */
STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	blk_opf_t	op;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				/* in-core corruption: stop everything */
				xfs_force_shutdown(bp->b_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_has_crc(mp)) {
				xfs_warn(mp,
					"%s: no buf ops on daddr 0x%llx len %d",
					__func__, xfs_buf_daddr(bp),
					bp->b_length);
				xfs_hex_dump(bp->b_addr,
						XFS_CORRUPTION_DUMP_LEN);
				dump_stack();
			}
		}
	} else {
		op = REQ_OP_READ;
		if (bp->b_flags & XBF_READ_AHEAD)
			op |= REQ_RAHEAD;
	}

	/* we only use the buffer cache for meta-data */
	op |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Wait for I/O completion of a sync buffer and return the I/O error code.
 */
static int
xfs_buf_iowait(
	struct xfs_buf	*bp)
{
	ASSERT(!(bp->b_flags & XBF_ASYNC));

	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);

	return bp->b_error;
}

/*
 * Buffer I/O submission path, read or write. Asynchronous submission transfers
 * the buffer lock ownership and the current reference to the IO. It is not
 * safe to reference the buffer after a call to this function unless the caller
 * holds an additional reference itself.
 */
static int
__xfs_buf_submit(
	struct xfs_buf	*bp,
	bool		wait)
{
	int		error = 0;

	trace_xfs_buf_submit(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	/*
	 * On log shutdown we stale and complete the buffer immediately. We can
	 * be called to read the superblock before the log has been set up, so
	 * be careful checking the log state.
	 *
	 * Checking the mount shutdown state here can result in the log tail
	 * moving inappropriately on disk as the log may not yet be shut down.
	 * i.e. failing this buffer on mount shutdown can remove it from the AIL
	 * and move the tail of the log forwards without having written this
	 * buffer to disk. This corrupts the log tail state in memory, and
	 * because the log may not be shut down yet, it can then be propagated
	 * to disk before the log is shutdown. Hence we check log shutdown
	 * state here rather than mount state to avoid corrupting the log tail
	 * on shutdown.
	 */
	if (bp->b_mount->m_log &&
	    xlog_is_shutdown(bp->b_mount->m_log)) {
		xfs_buf_ioend_fail(bp);
		return -EIO;
	}

	/*
	 * Grab a reference so the buffer does not go away underneath us. For
	 * async buffers, I/O completion drops the callers reference, which
	 * could occur before submission returns.
	 */
	xfs_buf_hold(bp);

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_ioacct_inc(bp);
	_xfs_buf_ioapply(bp);

	/*
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		/* run sync/errored completion inline, async via workqueue */
		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}

	if (wait)
		error = xfs_buf_iowait(bp);

	/*
	 * Release the hold that keeps the buffer referenced for the entire
	 * I/O. Note that if the buffer is async, it is not safe to reference
	 * after this release.
	 */
	xfs_buf_rele(bp);
	return error;
}

/*
 * Return a kernel address for byte @offset into the buffer's data, whether
 * the buffer is contiguously mapped (b_addr set) or page-based.
 */
void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Zero @bsize bytes of the buffer starting at byte offset @boff, walking the
 * backing pages one at a time and clamping each memset to the page and to the
 * end of the buffer.
 */
void
xfs_buf_zero(
	struct xfs_buf		*bp,
	size_t			boff,
	size_t			bsize)
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		memset(page_address(page) + page_offset, 0, csize);

		boff += csize;
	}
}

/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detect from a verifier, such as incorrect inter-block relationship
 * data. Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
 * be marked stale, but b_error will not be set. The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	xfs_buf_corruption_error(bp, fa);
	xfs_buf_stale(bp);
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_drain_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)

{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

/*
 * Wait for outstanding I/O on the buftarg to complete.
 */
void
xfs_buftarg_wait(
	struct xfs_buftarg	*btp)
{
	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);
}

/*
 * Drain every buffer off the buftarg LRU, waiting for in-flight I/O first and
 * polling until the LRU is empty.  Buffers that failed permanently
 * (XBF_WRITE_FAIL) are reported as thrown-away dirty metadata.
 */
void
xfs_buftarg_drain(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;
	bool			write_fail = false;

	xfs_buftarg_wait(btp);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				write_fail = true;
				xfs_buf_alert_ratelimited(bp,
					"XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
					(long long)xfs_buf_daddr(bp));
			}
			xfs_buf_rele(bp);
		}
		/* back off if buffers were skipped (held or lock contended) */
		if (loop++ != 0)
			delay(100);
	}

	/*
	 * If one or more failed buffers were freed, that means dirty metadata
	 * was thrown away. This should only ever happen after I/O completion
	 * handling has elevated I/O error(s) to permanent failures and shuts
	 * down the journal.
	 */
	if (write_fail) {
		ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
		xfs_alert(btp->bt_mount,
	      "Please run xfs_repair to determine the extent of the problem.");
	}
}

/*
 * LRU-walk callback for the shrinker: rotate buffers that still have an LRU
 * reference, isolate the rest onto the dispose list for freeing.
 */
static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

/*
 * Shrinker scan: isolate reclaimable buffers off the LRU and release them.
 * Returns the number of objects freed.
 */
static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

/* Shrinker count: report how many buffers sit on this buftarg's LRU. */
static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

/*
 * Tear down a buffer target: unregister the shrinker, destroy the counters
 * and LRU, drop the DAX device, release the block device (unless it is the
 * main device, which kill_block_super closes), and free the structure.
 */
void
xfs_free_buftarg(
	struct xfs_buftarg	*btp)
{
	struct block_device	*bdev = btp->bt_bdev;

	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
	/* the main block device is closed by kill_block_super */
	if (bdev != btp->bt_mount->m_super->s_bdev)
		blkdev_put(bdev, btp->bt_mount->m_super);

	kmem_free(btp);
}

/*
 * Set the metadata sector size for the buftarg and push it down to the block
 * device.  Returns 0 or -EINVAL if the device rejects the block size.
 */
int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		sectorsize)
{
	/* Set up metadata sector size info */
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %pg",
			sectorsize, btp->bt_bdev);
		return -EINVAL;
	}

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage. Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

/*
 * Allocate and initialise a buffer target for @bdev.
 * NOTE(review): this function continues past the end of this chunk; only the
 * initial field setup is visible here.
 */
struct xfs_buftarg *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev)
{
	xfs_buftarg_t		*btp;
	const struct dax_holder_operations *ops = NULL;

#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
	ops = &xfs_dax_holder_operations;
#endif
	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
201662306a36Sopenharmony_ci mp, ops); 201762306a36Sopenharmony_ci 201862306a36Sopenharmony_ci /* 201962306a36Sopenharmony_ci * Buffer IO error rate limiting. Limit it to no more than 10 messages 202062306a36Sopenharmony_ci * per 30 seconds so as to not spam logs too much on repeated errors. 202162306a36Sopenharmony_ci */ 202262306a36Sopenharmony_ci ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 202362306a36Sopenharmony_ci DEFAULT_RATELIMIT_BURST); 202462306a36Sopenharmony_ci 202562306a36Sopenharmony_ci if (xfs_setsize_buftarg_early(btp, bdev)) 202662306a36Sopenharmony_ci goto error_free; 202762306a36Sopenharmony_ci 202862306a36Sopenharmony_ci if (list_lru_init(&btp->bt_lru)) 202962306a36Sopenharmony_ci goto error_free; 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_ci if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) 203262306a36Sopenharmony_ci goto error_lru; 203362306a36Sopenharmony_ci 203462306a36Sopenharmony_ci btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 203562306a36Sopenharmony_ci btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 203662306a36Sopenharmony_ci btp->bt_shrinker.seeks = DEFAULT_SEEKS; 203762306a36Sopenharmony_ci btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; 203862306a36Sopenharmony_ci if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", 203962306a36Sopenharmony_ci mp->m_super->s_id)) 204062306a36Sopenharmony_ci goto error_pcpu; 204162306a36Sopenharmony_ci return btp; 204262306a36Sopenharmony_ci 204362306a36Sopenharmony_cierror_pcpu: 204462306a36Sopenharmony_ci percpu_counter_destroy(&btp->bt_io_count); 204562306a36Sopenharmony_cierror_lru: 204662306a36Sopenharmony_ci list_lru_destroy(&btp->bt_lru); 204762306a36Sopenharmony_cierror_free: 204862306a36Sopenharmony_ci kmem_free(btp); 204962306a36Sopenharmony_ci return NULL; 205062306a36Sopenharmony_ci} 205162306a36Sopenharmony_ci 205262306a36Sopenharmony_ci/* 205362306a36Sopenharmony_ci * Cancel a delayed write list. 
205462306a36Sopenharmony_ci * 205562306a36Sopenharmony_ci * Remove each buffer from the list, clear the delwri queue flag and drop the 205662306a36Sopenharmony_ci * associated buffer reference. 205762306a36Sopenharmony_ci */ 205862306a36Sopenharmony_civoid 205962306a36Sopenharmony_cixfs_buf_delwri_cancel( 206062306a36Sopenharmony_ci struct list_head *list) 206162306a36Sopenharmony_ci{ 206262306a36Sopenharmony_ci struct xfs_buf *bp; 206362306a36Sopenharmony_ci 206462306a36Sopenharmony_ci while (!list_empty(list)) { 206562306a36Sopenharmony_ci bp = list_first_entry(list, struct xfs_buf, b_list); 206662306a36Sopenharmony_ci 206762306a36Sopenharmony_ci xfs_buf_lock(bp); 206862306a36Sopenharmony_ci bp->b_flags &= ~_XBF_DELWRI_Q; 206962306a36Sopenharmony_ci list_del_init(&bp->b_list); 207062306a36Sopenharmony_ci xfs_buf_relse(bp); 207162306a36Sopenharmony_ci } 207262306a36Sopenharmony_ci} 207362306a36Sopenharmony_ci 207462306a36Sopenharmony_ci/* 207562306a36Sopenharmony_ci * Add a buffer to the delayed write list. 207662306a36Sopenharmony_ci * 207762306a36Sopenharmony_ci * This queues a buffer for writeout if it hasn't already been. Note that 207862306a36Sopenharmony_ci * neither this routine nor the buffer list submission functions perform 207962306a36Sopenharmony_ci * any internal synchronization. It is expected that the lists are thread-local 208062306a36Sopenharmony_ci * to the callers. 208162306a36Sopenharmony_ci * 208262306a36Sopenharmony_ci * Returns true if we queued up the buffer, or false if it already had 208362306a36Sopenharmony_ci * been on the buffer list. 
208462306a36Sopenharmony_ci */ 208562306a36Sopenharmony_cibool 208662306a36Sopenharmony_cixfs_buf_delwri_queue( 208762306a36Sopenharmony_ci struct xfs_buf *bp, 208862306a36Sopenharmony_ci struct list_head *list) 208962306a36Sopenharmony_ci{ 209062306a36Sopenharmony_ci ASSERT(xfs_buf_islocked(bp)); 209162306a36Sopenharmony_ci ASSERT(!(bp->b_flags & XBF_READ)); 209262306a36Sopenharmony_ci 209362306a36Sopenharmony_ci /* 209462306a36Sopenharmony_ci * If the buffer is already marked delwri it already is queued up 209562306a36Sopenharmony_ci * by someone else for imediate writeout. Just ignore it in that 209662306a36Sopenharmony_ci * case. 209762306a36Sopenharmony_ci */ 209862306a36Sopenharmony_ci if (bp->b_flags & _XBF_DELWRI_Q) { 209962306a36Sopenharmony_ci trace_xfs_buf_delwri_queued(bp, _RET_IP_); 210062306a36Sopenharmony_ci return false; 210162306a36Sopenharmony_ci } 210262306a36Sopenharmony_ci 210362306a36Sopenharmony_ci trace_xfs_buf_delwri_queue(bp, _RET_IP_); 210462306a36Sopenharmony_ci 210562306a36Sopenharmony_ci /* 210662306a36Sopenharmony_ci * If a buffer gets written out synchronously or marked stale while it 210762306a36Sopenharmony_ci * is on a delwri list we lazily remove it. To do this, the other party 210862306a36Sopenharmony_ci * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 210962306a36Sopenharmony_ci * It remains referenced and on the list. In a rare corner case it 211062306a36Sopenharmony_ci * might get readded to a delwri list after the synchronous writeout, in 211162306a36Sopenharmony_ci * which case we need just need to re-add the flag here. 
211262306a36Sopenharmony_ci */ 211362306a36Sopenharmony_ci bp->b_flags |= _XBF_DELWRI_Q; 211462306a36Sopenharmony_ci if (list_empty(&bp->b_list)) { 211562306a36Sopenharmony_ci atomic_inc(&bp->b_hold); 211662306a36Sopenharmony_ci list_add_tail(&bp->b_list, list); 211762306a36Sopenharmony_ci } 211862306a36Sopenharmony_ci 211962306a36Sopenharmony_ci return true; 212062306a36Sopenharmony_ci} 212162306a36Sopenharmony_ci 212262306a36Sopenharmony_ci/* 212362306a36Sopenharmony_ci * Compare function is more complex than it needs to be because 212462306a36Sopenharmony_ci * the return value is only 32 bits and we are doing comparisons 212562306a36Sopenharmony_ci * on 64 bit values 212662306a36Sopenharmony_ci */ 212762306a36Sopenharmony_cistatic int 212862306a36Sopenharmony_cixfs_buf_cmp( 212962306a36Sopenharmony_ci void *priv, 213062306a36Sopenharmony_ci const struct list_head *a, 213162306a36Sopenharmony_ci const struct list_head *b) 213262306a36Sopenharmony_ci{ 213362306a36Sopenharmony_ci struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 213462306a36Sopenharmony_ci struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 213562306a36Sopenharmony_ci xfs_daddr_t diff; 213662306a36Sopenharmony_ci 213762306a36Sopenharmony_ci diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 213862306a36Sopenharmony_ci if (diff < 0) 213962306a36Sopenharmony_ci return -1; 214062306a36Sopenharmony_ci if (diff > 0) 214162306a36Sopenharmony_ci return 1; 214262306a36Sopenharmony_ci return 0; 214362306a36Sopenharmony_ci} 214462306a36Sopenharmony_ci 214562306a36Sopenharmony_ci/* 214662306a36Sopenharmony_ci * Submit buffers for write. If wait_list is specified, the buffers are 214762306a36Sopenharmony_ci * submitted using sync I/O and placed on the wait list such that the caller can 214862306a36Sopenharmony_ci * iowait each buffer. Otherwise async I/O is used and the buffers are released 214962306a36Sopenharmony_ci * at I/O completion time. 
In either case, buffers remain locked until I/O 215062306a36Sopenharmony_ci * completes and the buffer is released from the queue. 215162306a36Sopenharmony_ci */ 215262306a36Sopenharmony_cistatic int 215362306a36Sopenharmony_cixfs_buf_delwri_submit_buffers( 215462306a36Sopenharmony_ci struct list_head *buffer_list, 215562306a36Sopenharmony_ci struct list_head *wait_list) 215662306a36Sopenharmony_ci{ 215762306a36Sopenharmony_ci struct xfs_buf *bp, *n; 215862306a36Sopenharmony_ci int pinned = 0; 215962306a36Sopenharmony_ci struct blk_plug plug; 216062306a36Sopenharmony_ci 216162306a36Sopenharmony_ci list_sort(NULL, buffer_list, xfs_buf_cmp); 216262306a36Sopenharmony_ci 216362306a36Sopenharmony_ci blk_start_plug(&plug); 216462306a36Sopenharmony_ci list_for_each_entry_safe(bp, n, buffer_list, b_list) { 216562306a36Sopenharmony_ci if (!wait_list) { 216662306a36Sopenharmony_ci if (!xfs_buf_trylock(bp)) 216762306a36Sopenharmony_ci continue; 216862306a36Sopenharmony_ci if (xfs_buf_ispinned(bp)) { 216962306a36Sopenharmony_ci xfs_buf_unlock(bp); 217062306a36Sopenharmony_ci pinned++; 217162306a36Sopenharmony_ci continue; 217262306a36Sopenharmony_ci } 217362306a36Sopenharmony_ci } else { 217462306a36Sopenharmony_ci xfs_buf_lock(bp); 217562306a36Sopenharmony_ci } 217662306a36Sopenharmony_ci 217762306a36Sopenharmony_ci /* 217862306a36Sopenharmony_ci * Someone else might have written the buffer synchronously or 217962306a36Sopenharmony_ci * marked it stale in the meantime. In that case only the 218062306a36Sopenharmony_ci * _XBF_DELWRI_Q flag got cleared, and we have to drop the 218162306a36Sopenharmony_ci * reference and remove it from the list here. 
218262306a36Sopenharmony_ci */ 218362306a36Sopenharmony_ci if (!(bp->b_flags & _XBF_DELWRI_Q)) { 218462306a36Sopenharmony_ci list_del_init(&bp->b_list); 218562306a36Sopenharmony_ci xfs_buf_relse(bp); 218662306a36Sopenharmony_ci continue; 218762306a36Sopenharmony_ci } 218862306a36Sopenharmony_ci 218962306a36Sopenharmony_ci trace_xfs_buf_delwri_split(bp, _RET_IP_); 219062306a36Sopenharmony_ci 219162306a36Sopenharmony_ci /* 219262306a36Sopenharmony_ci * If we have a wait list, each buffer (and associated delwri 219362306a36Sopenharmony_ci * queue reference) transfers to it and is submitted 219462306a36Sopenharmony_ci * synchronously. Otherwise, drop the buffer from the delwri 219562306a36Sopenharmony_ci * queue and submit async. 219662306a36Sopenharmony_ci */ 219762306a36Sopenharmony_ci bp->b_flags &= ~_XBF_DELWRI_Q; 219862306a36Sopenharmony_ci bp->b_flags |= XBF_WRITE; 219962306a36Sopenharmony_ci if (wait_list) { 220062306a36Sopenharmony_ci bp->b_flags &= ~XBF_ASYNC; 220162306a36Sopenharmony_ci list_move_tail(&bp->b_list, wait_list); 220262306a36Sopenharmony_ci } else { 220362306a36Sopenharmony_ci bp->b_flags |= XBF_ASYNC; 220462306a36Sopenharmony_ci list_del_init(&bp->b_list); 220562306a36Sopenharmony_ci } 220662306a36Sopenharmony_ci __xfs_buf_submit(bp, false); 220762306a36Sopenharmony_ci } 220862306a36Sopenharmony_ci blk_finish_plug(&plug); 220962306a36Sopenharmony_ci 221062306a36Sopenharmony_ci return pinned; 221162306a36Sopenharmony_ci} 221262306a36Sopenharmony_ci 221362306a36Sopenharmony_ci/* 221462306a36Sopenharmony_ci * Write out a buffer list asynchronously. 221562306a36Sopenharmony_ci * 221662306a36Sopenharmony_ci * This will take the @buffer_list, write all non-locked and non-pinned buffers 221762306a36Sopenharmony_ci * out and not wait for I/O completion on any of the buffers. This interface 221862306a36Sopenharmony_ci * is only safely useable for callers that can track I/O completion by higher 221962306a36Sopenharmony_ci * level means, e.g. 
AIL pushing as the @buffer_list is consumed in this 222062306a36Sopenharmony_ci * function. 222162306a36Sopenharmony_ci * 222262306a36Sopenharmony_ci * Note: this function will skip buffers it would block on, and in doing so 222362306a36Sopenharmony_ci * leaves them on @buffer_list so they can be retried on a later pass. As such, 222462306a36Sopenharmony_ci * it is up to the caller to ensure that the buffer list is fully submitted or 222562306a36Sopenharmony_ci * cancelled appropriately when they are finished with the list. Failure to 222662306a36Sopenharmony_ci * cancel or resubmit the list until it is empty will result in leaked buffers 222762306a36Sopenharmony_ci * at unmount time. 222862306a36Sopenharmony_ci */ 222962306a36Sopenharmony_ciint 223062306a36Sopenharmony_cixfs_buf_delwri_submit_nowait( 223162306a36Sopenharmony_ci struct list_head *buffer_list) 223262306a36Sopenharmony_ci{ 223362306a36Sopenharmony_ci return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 223462306a36Sopenharmony_ci} 223562306a36Sopenharmony_ci 223662306a36Sopenharmony_ci/* 223762306a36Sopenharmony_ci * Write out a buffer list synchronously. 223862306a36Sopenharmony_ci * 223962306a36Sopenharmony_ci * This will take the @buffer_list, write all buffers out and wait for I/O 224062306a36Sopenharmony_ci * completion on all of the buffers. @buffer_list is consumed by the function, 224162306a36Sopenharmony_ci * so callers must have some other way of tracking buffers if they require such 224262306a36Sopenharmony_ci * functionality. 
224362306a36Sopenharmony_ci */ 224462306a36Sopenharmony_ciint 224562306a36Sopenharmony_cixfs_buf_delwri_submit( 224662306a36Sopenharmony_ci struct list_head *buffer_list) 224762306a36Sopenharmony_ci{ 224862306a36Sopenharmony_ci LIST_HEAD (wait_list); 224962306a36Sopenharmony_ci int error = 0, error2; 225062306a36Sopenharmony_ci struct xfs_buf *bp; 225162306a36Sopenharmony_ci 225262306a36Sopenharmony_ci xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); 225362306a36Sopenharmony_ci 225462306a36Sopenharmony_ci /* Wait for IO to complete. */ 225562306a36Sopenharmony_ci while (!list_empty(&wait_list)) { 225662306a36Sopenharmony_ci bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 225762306a36Sopenharmony_ci 225862306a36Sopenharmony_ci list_del_init(&bp->b_list); 225962306a36Sopenharmony_ci 226062306a36Sopenharmony_ci /* 226162306a36Sopenharmony_ci * Wait on the locked buffer, check for errors and unlock and 226262306a36Sopenharmony_ci * release the delwri queue reference. 226362306a36Sopenharmony_ci */ 226462306a36Sopenharmony_ci error2 = xfs_buf_iowait(bp); 226562306a36Sopenharmony_ci xfs_buf_relse(bp); 226662306a36Sopenharmony_ci if (!error) 226762306a36Sopenharmony_ci error = error2; 226862306a36Sopenharmony_ci } 226962306a36Sopenharmony_ci 227062306a36Sopenharmony_ci return error; 227162306a36Sopenharmony_ci} 227262306a36Sopenharmony_ci 227362306a36Sopenharmony_ci/* 227462306a36Sopenharmony_ci * Push a single buffer on a delwri queue. 227562306a36Sopenharmony_ci * 227662306a36Sopenharmony_ci * The purpose of this function is to submit a single buffer of a delwri queue 227762306a36Sopenharmony_ci * and return with the buffer still on the original queue. The waiting delwri 227862306a36Sopenharmony_ci * buffer submission infrastructure guarantees transfer of the delwri queue 227962306a36Sopenharmony_ci * buffer reference to a temporary wait list. We reuse this infrastructure to 228062306a36Sopenharmony_ci * transfer the buffer back to the original queue. 
228162306a36Sopenharmony_ci * 228262306a36Sopenharmony_ci * Note the buffer transitions from the queued state, to the submitted and wait 228362306a36Sopenharmony_ci * listed state and back to the queued state during this call. The buffer 228462306a36Sopenharmony_ci * locking and queue management logic between _delwri_pushbuf() and 228562306a36Sopenharmony_ci * _delwri_queue() guarantee that the buffer cannot be queued to another list 228662306a36Sopenharmony_ci * before returning. 228762306a36Sopenharmony_ci */ 228862306a36Sopenharmony_ciint 228962306a36Sopenharmony_cixfs_buf_delwri_pushbuf( 229062306a36Sopenharmony_ci struct xfs_buf *bp, 229162306a36Sopenharmony_ci struct list_head *buffer_list) 229262306a36Sopenharmony_ci{ 229362306a36Sopenharmony_ci LIST_HEAD (submit_list); 229462306a36Sopenharmony_ci int error; 229562306a36Sopenharmony_ci 229662306a36Sopenharmony_ci ASSERT(bp->b_flags & _XBF_DELWRI_Q); 229762306a36Sopenharmony_ci 229862306a36Sopenharmony_ci trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 229962306a36Sopenharmony_ci 230062306a36Sopenharmony_ci /* 230162306a36Sopenharmony_ci * Isolate the buffer to a new local list so we can submit it for I/O 230262306a36Sopenharmony_ci * independently from the rest of the original list. 230362306a36Sopenharmony_ci */ 230462306a36Sopenharmony_ci xfs_buf_lock(bp); 230562306a36Sopenharmony_ci list_move(&bp->b_list, &submit_list); 230662306a36Sopenharmony_ci xfs_buf_unlock(bp); 230762306a36Sopenharmony_ci 230862306a36Sopenharmony_ci /* 230962306a36Sopenharmony_ci * Delwri submission clears the DELWRI_Q buffer flag and returns with 231062306a36Sopenharmony_ci * the buffer on the wait list with the original reference. Rather than 231162306a36Sopenharmony_ci * bounce the buffer from a local wait list back to the original list 231262306a36Sopenharmony_ci * after I/O completion, reuse the original list as the wait list. 
231362306a36Sopenharmony_ci */ 231462306a36Sopenharmony_ci xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 231562306a36Sopenharmony_ci 231662306a36Sopenharmony_ci /* 231762306a36Sopenharmony_ci * The buffer is now locked, under I/O and wait listed on the original 231862306a36Sopenharmony_ci * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 231962306a36Sopenharmony_ci * return with the buffer unlocked and on the original queue. 232062306a36Sopenharmony_ci */ 232162306a36Sopenharmony_ci error = xfs_buf_iowait(bp); 232262306a36Sopenharmony_ci bp->b_flags |= _XBF_DELWRI_Q; 232362306a36Sopenharmony_ci xfs_buf_unlock(bp); 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ci return error; 232662306a36Sopenharmony_ci} 232762306a36Sopenharmony_ci 232862306a36Sopenharmony_civoid xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 232962306a36Sopenharmony_ci{ 233062306a36Sopenharmony_ci /* 233162306a36Sopenharmony_ci * Set the lru reference count to 0 based on the error injection tag. 233262306a36Sopenharmony_ci * This allows userspace to disrupt buffer caching for debug/testing 233362306a36Sopenharmony_ci * purposes. 233462306a36Sopenharmony_ci */ 233562306a36Sopenharmony_ci if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 233662306a36Sopenharmony_ci lru_ref = 0; 233762306a36Sopenharmony_ci 233862306a36Sopenharmony_ci atomic_set(&bp->b_lru_ref, lru_ref); 233962306a36Sopenharmony_ci} 234062306a36Sopenharmony_ci 234162306a36Sopenharmony_ci/* 234262306a36Sopenharmony_ci * Verify an on-disk magic value against the magic value specified in the 234362306a36Sopenharmony_ci * verifier structure. The verifier magic is in disk byte order so the caller is 234462306a36Sopenharmony_ci * expected to pass the value directly from disk. 
234562306a36Sopenharmony_ci */ 234662306a36Sopenharmony_cibool 234762306a36Sopenharmony_cixfs_verify_magic( 234862306a36Sopenharmony_ci struct xfs_buf *bp, 234962306a36Sopenharmony_ci __be32 dmagic) 235062306a36Sopenharmony_ci{ 235162306a36Sopenharmony_ci struct xfs_mount *mp = bp->b_mount; 235262306a36Sopenharmony_ci int idx; 235362306a36Sopenharmony_ci 235462306a36Sopenharmony_ci idx = xfs_has_crc(mp); 235562306a36Sopenharmony_ci if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 235662306a36Sopenharmony_ci return false; 235762306a36Sopenharmony_ci return dmagic == bp->b_ops->magic[idx]; 235862306a36Sopenharmony_ci} 235962306a36Sopenharmony_ci/* 236062306a36Sopenharmony_ci * Verify an on-disk magic value against the magic value specified in the 236162306a36Sopenharmony_ci * verifier structure. The verifier magic is in disk byte order so the caller is 236262306a36Sopenharmony_ci * expected to pass the value directly from disk. 236362306a36Sopenharmony_ci */ 236462306a36Sopenharmony_cibool 236562306a36Sopenharmony_cixfs_verify_magic16( 236662306a36Sopenharmony_ci struct xfs_buf *bp, 236762306a36Sopenharmony_ci __be16 dmagic) 236862306a36Sopenharmony_ci{ 236962306a36Sopenharmony_ci struct xfs_mount *mp = bp->b_mount; 237062306a36Sopenharmony_ci int idx; 237162306a36Sopenharmony_ci 237262306a36Sopenharmony_ci idx = xfs_has_crc(mp); 237362306a36Sopenharmony_ci if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 237462306a36Sopenharmony_ci return false; 237562306a36Sopenharmony_ci return dmagic == bp->b_ops->magic16[idx]; 237662306a36Sopenharmony_ci} 2377