162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2010, 2023 Red Hat, Inc. 462306a36Sopenharmony_ci * All Rights Reserved. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci#include "xfs.h" 762306a36Sopenharmony_ci#include "xfs_shared.h" 862306a36Sopenharmony_ci#include "xfs_format.h" 962306a36Sopenharmony_ci#include "xfs_log_format.h" 1062306a36Sopenharmony_ci#include "xfs_trans_resv.h" 1162306a36Sopenharmony_ci#include "xfs_mount.h" 1262306a36Sopenharmony_ci#include "xfs_btree.h" 1362306a36Sopenharmony_ci#include "xfs_alloc_btree.h" 1462306a36Sopenharmony_ci#include "xfs_alloc.h" 1562306a36Sopenharmony_ci#include "xfs_discard.h" 1662306a36Sopenharmony_ci#include "xfs_error.h" 1762306a36Sopenharmony_ci#include "xfs_extent_busy.h" 1862306a36Sopenharmony_ci#include "xfs_trace.h" 1962306a36Sopenharmony_ci#include "xfs_log.h" 2062306a36Sopenharmony_ci#include "xfs_ag.h" 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_ci/* 2362306a36Sopenharmony_ci * Notes on an efficient, low latency fstrim algorithm 2462306a36Sopenharmony_ci * 2562306a36Sopenharmony_ci * We need to walk the filesystem free space and issue discards on the free 2662306a36Sopenharmony_ci * space that meet the search criteria (size and location). We cannot issue 2762306a36Sopenharmony_ci * discards on extents that might be in use, or are so recently in use they are 2862306a36Sopenharmony_ci * still marked as busy. To serialise against extent state changes whilst we are 2962306a36Sopenharmony_ci * gathering extents to trim, we must hold the AGF lock to lock out other 3062306a36Sopenharmony_ci * allocations and extent free operations that might change extent state. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci * However, we cannot just hold the AGF for the entire AG free space walk whilst 3362306a36Sopenharmony_ci * we issue discards on each free space that is found. Storage devices can have 3462306a36Sopenharmony_ci * extremely slow discard implementations (e.g. ceph RBD) and so walking a 3562306a36Sopenharmony_ci * couple of million free extents and issuing synchronous discards on each 3662306a36Sopenharmony_ci * extent can take a *long* time. Whilst we are doing this walk, nothing else 3762306a36Sopenharmony_ci * can access the AGF, and we can stall transactions and hence the log whilst 3862306a36Sopenharmony_ci * modifications wait for the AGF lock to be released. This can lead hung tasks 3962306a36Sopenharmony_ci * kicking the hung task timer and rebooting the system. This is bad. 4062306a36Sopenharmony_ci * 4162306a36Sopenharmony_ci * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI 4262306a36Sopenharmony_ci * lock, gathers a range of inode cluster buffers that are allocated, drops the 4362306a36Sopenharmony_ci * AGI lock and then reads all the inode cluster buffers and processes them. It 4462306a36Sopenharmony_ci * loops doing this, using a cursor to keep track of where it is up to in the AG 4562306a36Sopenharmony_ci * for each iteration to restart the INOBT lookup from. 4662306a36Sopenharmony_ci * 4762306a36Sopenharmony_ci * We can't do this exactly with free space - once we drop the AGF lock, the 4862306a36Sopenharmony_ci * state of the free extent is out of our control and we cannot run a discard 4962306a36Sopenharmony_ci * safely on it in this situation. Unless, of course, we've marked the free 5062306a36Sopenharmony_ci * extent as busy and undergoing a discard operation whilst we held the AGF 5162306a36Sopenharmony_ci * locked. 5262306a36Sopenharmony_ci * 5362306a36Sopenharmony_ci * This is exactly how online discard works - free extents are marked busy when 5462306a36Sopenharmony_ci * they are freed, and once the extent free has been committed to the journal, 5562306a36Sopenharmony_ci * the busy extent record is marked as "undergoing discard" and the discard is 5662306a36Sopenharmony_ci * then issued on the free extent. Once the discard completes, the busy extent 5762306a36Sopenharmony_ci * record is removed and the extent is able to be allocated again. 5862306a36Sopenharmony_ci * 5962306a36Sopenharmony_ci * In the context of fstrim, if we find a free extent we need to discard, we 6062306a36Sopenharmony_ci * don't have to discard it immediately. All we need to do it record that free 6162306a36Sopenharmony_ci * extent as being busy and under discard, and all the allocation routines will 6262306a36Sopenharmony_ci * now avoid trying to allocate it. Hence if we mark the extent as busy under 6362306a36Sopenharmony_ci * the AGF lock, we can safely discard it without holding the AGF lock because 6462306a36Sopenharmony_ci * nothing will attempt to allocate that free space until the discard completes. 6562306a36Sopenharmony_ci * 6662306a36Sopenharmony_ci * This also allows us to issue discards asynchronously like we do with online 6762306a36Sopenharmony_ci * discard, and so for fast devices fstrim will run much faster as we can have 6862306a36Sopenharmony_ci * multiple discard operations in flight at once, as well as pipeline the free 6962306a36Sopenharmony_ci * extent search so that it overlaps in flight discard IO. 7062306a36Sopenharmony_ci */ 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_cistruct workqueue_struct *xfs_discard_wq; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_cistatic void 7562306a36Sopenharmony_cixfs_discard_endio_work( 7662306a36Sopenharmony_ci struct work_struct *work) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci struct xfs_busy_extents *extents = 7962306a36Sopenharmony_ci container_of(work, struct xfs_busy_extents, endio_work); 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); 8262306a36Sopenharmony_ci kmem_free(extents->owner); 8362306a36Sopenharmony_ci} 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci/* 8662306a36Sopenharmony_ci * Queue up the actual completion to a thread to avoid IRQ-safe locking for 8762306a36Sopenharmony_ci * pagb_lock. 8862306a36Sopenharmony_ci */ 8962306a36Sopenharmony_cistatic void 9062306a36Sopenharmony_cixfs_discard_endio( 9162306a36Sopenharmony_ci struct bio *bio) 9262306a36Sopenharmony_ci{ 9362306a36Sopenharmony_ci struct xfs_busy_extents *extents = bio->bi_private; 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci INIT_WORK(&extents->endio_work, xfs_discard_endio_work); 9662306a36Sopenharmony_ci queue_work(xfs_discard_wq, &extents->endio_work); 9762306a36Sopenharmony_ci bio_put(bio); 9862306a36Sopenharmony_ci} 9962306a36Sopenharmony_ci 10062306a36Sopenharmony_ci/* 10162306a36Sopenharmony_ci * Walk the discard list and issue discards on all the busy extents in the 10262306a36Sopenharmony_ci * list. We plug and chain the bios so that we only need a single completion 10362306a36Sopenharmony_ci * call to clear all the busy extents once the discards are complete. 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_ciint 10662306a36Sopenharmony_cixfs_discard_extents( 10762306a36Sopenharmony_ci struct xfs_mount *mp, 10862306a36Sopenharmony_ci struct xfs_busy_extents *extents) 10962306a36Sopenharmony_ci{ 11062306a36Sopenharmony_ci struct xfs_extent_busy *busyp; 11162306a36Sopenharmony_ci struct bio *bio = NULL; 11262306a36Sopenharmony_ci struct blk_plug plug; 11362306a36Sopenharmony_ci int error = 0; 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_ci blk_start_plug(&plug); 11662306a36Sopenharmony_ci list_for_each_entry(busyp, &extents->extent_list, list) { 11762306a36Sopenharmony_ci trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, 11862306a36Sopenharmony_ci busyp->length); 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, 12162306a36Sopenharmony_ci XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), 12262306a36Sopenharmony_ci XFS_FSB_TO_BB(mp, busyp->length), 12362306a36Sopenharmony_ci GFP_NOFS, &bio); 12462306a36Sopenharmony_ci if (error && error != -EOPNOTSUPP) { 12562306a36Sopenharmony_ci xfs_info(mp, 12662306a36Sopenharmony_ci "discard failed for extent [0x%llx,%u], error %d", 12762306a36Sopenharmony_ci (unsigned long long)busyp->bno, 12862306a36Sopenharmony_ci busyp->length, 12962306a36Sopenharmony_ci error); 13062306a36Sopenharmony_ci break; 13162306a36Sopenharmony_ci } 13262306a36Sopenharmony_ci } 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_ci if (bio) { 13562306a36Sopenharmony_ci bio->bi_private = extents; 13662306a36Sopenharmony_ci bio->bi_end_io = xfs_discard_endio; 13762306a36Sopenharmony_ci submit_bio(bio); 13862306a36Sopenharmony_ci } else { 13962306a36Sopenharmony_ci xfs_discard_endio_work(&extents->endio_work); 14062306a36Sopenharmony_ci } 14162306a36Sopenharmony_ci blk_finish_plug(&plug); 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci return error; 14462306a36Sopenharmony_ci} 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_cistatic int 14862306a36Sopenharmony_cixfs_trim_gather_extents( 14962306a36Sopenharmony_ci struct xfs_perag *pag, 15062306a36Sopenharmony_ci xfs_daddr_t start, 15162306a36Sopenharmony_ci xfs_daddr_t end, 15262306a36Sopenharmony_ci xfs_daddr_t minlen, 15362306a36Sopenharmony_ci struct xfs_alloc_rec_incore *tcur, 15462306a36Sopenharmony_ci struct xfs_busy_extents *extents, 15562306a36Sopenharmony_ci uint64_t *blocks_trimmed) 15662306a36Sopenharmony_ci{ 15762306a36Sopenharmony_ci struct xfs_mount *mp = pag->pag_mount; 15862306a36Sopenharmony_ci struct xfs_btree_cur *cur; 15962306a36Sopenharmony_ci struct xfs_buf *agbp; 16062306a36Sopenharmony_ci int error; 16162306a36Sopenharmony_ci int i; 16262306a36Sopenharmony_ci int batch = 100; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci /* 16562306a36Sopenharmony_ci * Force out the log. This means any transactions that might have freed 16662306a36Sopenharmony_ci * space before we take the AGF buffer lock are now on disk, and the 16762306a36Sopenharmony_ci * volatile disk cache is flushed. 16862306a36Sopenharmony_ci */ 16962306a36Sopenharmony_ci xfs_log_force(mp, XFS_LOG_SYNC); 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); 17262306a36Sopenharmony_ci if (error) 17362306a36Sopenharmony_ci return error; 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ci /* 17862306a36Sopenharmony_ci * Look up the extent length requested in the AGF and start with it. 17962306a36Sopenharmony_ci */ 18062306a36Sopenharmony_ci if (tcur->ar_startblock == NULLAGBLOCK) 18162306a36Sopenharmony_ci error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); 18262306a36Sopenharmony_ci else 18362306a36Sopenharmony_ci error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, 18462306a36Sopenharmony_ci tcur->ar_blockcount, &i); 18562306a36Sopenharmony_ci if (error) 18662306a36Sopenharmony_ci goto out_del_cursor; 18762306a36Sopenharmony_ci if (i == 0) { 18862306a36Sopenharmony_ci /* nothing of that length left in the AG, we are done */ 18962306a36Sopenharmony_ci tcur->ar_blockcount = 0; 19062306a36Sopenharmony_ci goto out_del_cursor; 19162306a36Sopenharmony_ci } 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci /* 19462306a36Sopenharmony_ci * Loop until we are done with all extents that are large 19562306a36Sopenharmony_ci * enough to be worth discarding or we hit batch limits. 19662306a36Sopenharmony_ci */ 19762306a36Sopenharmony_ci while (i) { 19862306a36Sopenharmony_ci xfs_agblock_t fbno; 19962306a36Sopenharmony_ci xfs_extlen_t flen; 20062306a36Sopenharmony_ci xfs_daddr_t dbno; 20162306a36Sopenharmony_ci xfs_extlen_t dlen; 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 20462306a36Sopenharmony_ci if (error) 20562306a36Sopenharmony_ci break; 20662306a36Sopenharmony_ci if (XFS_IS_CORRUPT(mp, i != 1)) { 20762306a36Sopenharmony_ci error = -EFSCORRUPTED; 20862306a36Sopenharmony_ci break; 20962306a36Sopenharmony_ci } 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci if (--batch <= 0) { 21262306a36Sopenharmony_ci /* 21362306a36Sopenharmony_ci * Update the cursor to point at this extent so we 21462306a36Sopenharmony_ci * restart the next batch from this extent. 21562306a36Sopenharmony_ci */ 21662306a36Sopenharmony_ci tcur->ar_startblock = fbno; 21762306a36Sopenharmony_ci tcur->ar_blockcount = flen; 21862306a36Sopenharmony_ci break; 21962306a36Sopenharmony_ci } 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci /* 22262306a36Sopenharmony_ci * use daddr format for all range/len calculations as that is 22362306a36Sopenharmony_ci * the format the range/len variables are supplied in by 22462306a36Sopenharmony_ci * userspace. 22562306a36Sopenharmony_ci */ 22662306a36Sopenharmony_ci dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno); 22762306a36Sopenharmony_ci dlen = XFS_FSB_TO_BB(mp, flen); 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci /* 23062306a36Sopenharmony_ci * Too small? Give up. 23162306a36Sopenharmony_ci */ 23262306a36Sopenharmony_ci if (dlen < minlen) { 23362306a36Sopenharmony_ci trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); 23462306a36Sopenharmony_ci tcur->ar_blockcount = 0; 23562306a36Sopenharmony_ci break; 23662306a36Sopenharmony_ci } 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci /* 23962306a36Sopenharmony_ci * If the extent is entirely outside of the range we are 24062306a36Sopenharmony_ci * supposed to discard skip it. Do not bother to trim 24162306a36Sopenharmony_ci * down partially overlapping ranges for now. 24262306a36Sopenharmony_ci */ 24362306a36Sopenharmony_ci if (dbno + dlen < start || dbno > end) { 24462306a36Sopenharmony_ci trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); 24562306a36Sopenharmony_ci goto next_extent; 24662306a36Sopenharmony_ci } 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci /* 24962306a36Sopenharmony_ci * If any blocks in the range are still busy, skip the 25062306a36Sopenharmony_ci * discard and try again the next time. 25162306a36Sopenharmony_ci */ 25262306a36Sopenharmony_ci if (xfs_extent_busy_search(mp, pag, fbno, flen)) { 25362306a36Sopenharmony_ci trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen); 25462306a36Sopenharmony_ci goto next_extent; 25562306a36Sopenharmony_ci } 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci xfs_extent_busy_insert_discard(pag, fbno, flen, 25862306a36Sopenharmony_ci &extents->extent_list); 25962306a36Sopenharmony_ci *blocks_trimmed += flen; 26062306a36Sopenharmony_cinext_extent: 26162306a36Sopenharmony_ci error = xfs_btree_decrement(cur, 0, &i); 26262306a36Sopenharmony_ci if (error) 26362306a36Sopenharmony_ci break; 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci /* 26662306a36Sopenharmony_ci * If there's no more records in the tree, we are done. Set the 26762306a36Sopenharmony_ci * cursor block count to 0 to indicate to the caller that there 26862306a36Sopenharmony_ci * is no more extents to search. 26962306a36Sopenharmony_ci */ 27062306a36Sopenharmony_ci if (i == 0) 27162306a36Sopenharmony_ci tcur->ar_blockcount = 0; 27262306a36Sopenharmony_ci } 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci /* 27562306a36Sopenharmony_ci * If there was an error, release all the gathered busy extents because 27662306a36Sopenharmony_ci * we aren't going to issue a discard on them any more. 27762306a36Sopenharmony_ci */ 27862306a36Sopenharmony_ci if (error) 27962306a36Sopenharmony_ci xfs_extent_busy_clear(mp, &extents->extent_list, false); 28062306a36Sopenharmony_ciout_del_cursor: 28162306a36Sopenharmony_ci xfs_btree_del_cursor(cur, error); 28262306a36Sopenharmony_ci xfs_buf_relse(agbp); 28362306a36Sopenharmony_ci return error; 28462306a36Sopenharmony_ci} 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_cistatic bool 28762306a36Sopenharmony_cixfs_trim_should_stop(void) 28862306a36Sopenharmony_ci{ 28962306a36Sopenharmony_ci return fatal_signal_pending(current) || freezing(current); 29062306a36Sopenharmony_ci} 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci/* 29362306a36Sopenharmony_ci * Iterate the free list gathering extents and discarding them. We need a cursor 29462306a36Sopenharmony_ci * for the repeated iteration of gather/discard loop, so use the longest extent 29562306a36Sopenharmony_ci * we found in the last batch as the key to start the next. 29662306a36Sopenharmony_ci */ 29762306a36Sopenharmony_cistatic int 29862306a36Sopenharmony_cixfs_trim_extents( 29962306a36Sopenharmony_ci struct xfs_perag *pag, 30062306a36Sopenharmony_ci xfs_daddr_t start, 30162306a36Sopenharmony_ci xfs_daddr_t end, 30262306a36Sopenharmony_ci xfs_daddr_t minlen, 30362306a36Sopenharmony_ci uint64_t *blocks_trimmed) 30462306a36Sopenharmony_ci{ 30562306a36Sopenharmony_ci struct xfs_alloc_rec_incore tcur = { 30662306a36Sopenharmony_ci .ar_blockcount = pag->pagf_longest, 30762306a36Sopenharmony_ci .ar_startblock = NULLAGBLOCK, 30862306a36Sopenharmony_ci }; 30962306a36Sopenharmony_ci int error = 0; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci do { 31262306a36Sopenharmony_ci struct xfs_busy_extents *extents; 31362306a36Sopenharmony_ci 31462306a36Sopenharmony_ci extents = kzalloc(sizeof(*extents), GFP_KERNEL); 31562306a36Sopenharmony_ci if (!extents) { 31662306a36Sopenharmony_ci error = -ENOMEM; 31762306a36Sopenharmony_ci break; 31862306a36Sopenharmony_ci } 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci extents->mount = pag->pag_mount; 32162306a36Sopenharmony_ci extents->owner = extents; 32262306a36Sopenharmony_ci INIT_LIST_HEAD(&extents->extent_list); 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci error = xfs_trim_gather_extents(pag, start, end, minlen, 32562306a36Sopenharmony_ci &tcur, extents, blocks_trimmed); 32662306a36Sopenharmony_ci if (error) { 32762306a36Sopenharmony_ci kfree(extents); 32862306a36Sopenharmony_ci break; 32962306a36Sopenharmony_ci } 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci /* 33262306a36Sopenharmony_ci * We hand the extent list to the discard function here so the 33362306a36Sopenharmony_ci * discarded extents can be removed from the busy extent list. 33462306a36Sopenharmony_ci * This allows the discards to run asynchronously with gathering 33562306a36Sopenharmony_ci * the next round of extents to discard. 33662306a36Sopenharmony_ci * 33762306a36Sopenharmony_ci * However, we must ensure that we do not reference the extent 33862306a36Sopenharmony_ci * list after this function call, as it may have been freed by 33962306a36Sopenharmony_ci * the time control returns to us. 34062306a36Sopenharmony_ci */ 34162306a36Sopenharmony_ci error = xfs_discard_extents(pag->pag_mount, extents); 34262306a36Sopenharmony_ci if (error) 34362306a36Sopenharmony_ci break; 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ci if (xfs_trim_should_stop()) 34662306a36Sopenharmony_ci break; 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ci } while (tcur.ar_blockcount != 0); 34962306a36Sopenharmony_ci 35062306a36Sopenharmony_ci return error; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci} 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci/* 35562306a36Sopenharmony_ci * trim a range of the filesystem. 35662306a36Sopenharmony_ci * 35762306a36Sopenharmony_ci * Note: the parameters passed from userspace are byte ranges into the 35862306a36Sopenharmony_ci * filesystem which does not match to the format we use for filesystem block 35962306a36Sopenharmony_ci * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format 36062306a36Sopenharmony_ci * is a linear address range. Hence we need to use DADDR based conversions and 36162306a36Sopenharmony_ci * comparisons for determining the correct offset and regions to trim. 36262306a36Sopenharmony_ci */ 36362306a36Sopenharmony_ciint 36462306a36Sopenharmony_cixfs_ioc_trim( 36562306a36Sopenharmony_ci struct xfs_mount *mp, 36662306a36Sopenharmony_ci struct fstrim_range __user *urange) 36762306a36Sopenharmony_ci{ 36862306a36Sopenharmony_ci struct xfs_perag *pag; 36962306a36Sopenharmony_ci unsigned int granularity = 37062306a36Sopenharmony_ci bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); 37162306a36Sopenharmony_ci struct fstrim_range range; 37262306a36Sopenharmony_ci xfs_daddr_t start, end, minlen; 37362306a36Sopenharmony_ci xfs_agnumber_t agno; 37462306a36Sopenharmony_ci uint64_t blocks_trimmed = 0; 37562306a36Sopenharmony_ci int error, last_error = 0; 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci if (!capable(CAP_SYS_ADMIN)) 37862306a36Sopenharmony_ci return -EPERM; 37962306a36Sopenharmony_ci if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) 38062306a36Sopenharmony_ci return -EOPNOTSUPP; 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci /* 38362306a36Sopenharmony_ci * We haven't recovered the log, so we cannot use our bnobt-guided 38462306a36Sopenharmony_ci * storage zapping commands. 38562306a36Sopenharmony_ci */ 38662306a36Sopenharmony_ci if (xfs_has_norecovery(mp)) 38762306a36Sopenharmony_ci return -EROFS; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci if (copy_from_user(&range, urange, sizeof(range))) 39062306a36Sopenharmony_ci return -EFAULT; 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci range.minlen = max_t(u64, granularity, range.minlen); 39362306a36Sopenharmony_ci minlen = BTOBB(range.minlen); 39462306a36Sopenharmony_ci /* 39562306a36Sopenharmony_ci * Truncating down the len isn't actually quite correct, but using 39662306a36Sopenharmony_ci * BBTOB would mean we trivially get overflows for values 39762306a36Sopenharmony_ci * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default 39862306a36Sopenharmony_ci * used by the fstrim application. In the end it really doesn't 39962306a36Sopenharmony_ci * matter as trimming blocks is an advisory interface. 40062306a36Sopenharmony_ci */ 40162306a36Sopenharmony_ci if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || 40262306a36Sopenharmony_ci range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || 40362306a36Sopenharmony_ci range.len < mp->m_sb.sb_blocksize) 40462306a36Sopenharmony_ci return -EINVAL; 40562306a36Sopenharmony_ci 40662306a36Sopenharmony_ci start = BTOBB(range.start); 40762306a36Sopenharmony_ci end = start + BTOBBT(range.len) - 1; 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) 41062306a36Sopenharmony_ci end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1; 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_ci agno = xfs_daddr_to_agno(mp, start); 41362306a36Sopenharmony_ci for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { 41462306a36Sopenharmony_ci error = xfs_trim_extents(pag, start, end, minlen, 41562306a36Sopenharmony_ci &blocks_trimmed); 41662306a36Sopenharmony_ci if (error) 41762306a36Sopenharmony_ci last_error = error; 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci if (xfs_trim_should_stop()) { 42062306a36Sopenharmony_ci xfs_perag_rele(pag); 42162306a36Sopenharmony_ci break; 42262306a36Sopenharmony_ci } 42362306a36Sopenharmony_ci } 42462306a36Sopenharmony_ci 42562306a36Sopenharmony_ci if (last_error) 42662306a36Sopenharmony_ci return last_error; 42762306a36Sopenharmony_ci 42862306a36Sopenharmony_ci range.len = XFS_FSB_TO_B(mp, blocks_trimmed); 42962306a36Sopenharmony_ci if (copy_to_user(urange, &range, sizeof(range))) 43062306a36Sopenharmony_ci return -EFAULT; 43162306a36Sopenharmony_ci return 0; 43262306a36Sopenharmony_ci} 433