// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"

#include <linux/iversion.h>

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode or i_state! */
	VFS_I(ip)->i_mode = 0;
	VFS_I(ip)->i_state = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	if (ip->i_afp) {
		xfs_idestroy_fork(ip->i_afp);
		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
	}
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
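	 * (The RCU lookup side, xfs_iget_cache_hit(), re-checks ip->i_ino
	 * under this same lock and backs off with -EAGAIN if it finds a freed
	 * or recycled inode.)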
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and there
 * isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}


/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

static void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
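	 * (For a plain lookup this shows up as -ENOENT from
	 * xfs_iget_check_free_state(); for XFS_IGET_CREATE it is reported
	 * as -EFSCORRUPTED.)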
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool	wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble. Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;
		ip->i_sick = 0;
		ip->i_checked = 0;

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the di_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
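	 * (The else branch below does just that: it maps the inode to its
	 * cluster buffer with xfs_imap_to_bp() and decodes the on-disk core
	 * via xfs_inode_from_disk().)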
	 */
	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_dinode	*dip;
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip, dip);
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system. The inode is looked up
 * in the cache held in each AG. If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
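 *
 * As a rough illustration only (not a caller taken from this file), a
 * metadata operation might do something like:
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	// ... operate on the inode under the ILOCK ...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);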
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now. If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system. If the inode is
 * in cache and isn't in purgatory, set *inuse to 1 if the inode is allocated
 * and 0 if it is not. For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer. This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that. If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned. The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

/*
 * Decide if the given @ip is eligible to be a part of the inode walk, and
 * grab it if so. Returns true if it's ready to go or false if we should just
 * ignore it.
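 * (A "true" return implies we now hold an igrab() reference on the VFS
 * inode; the walk loop below drops it with xfs_irele() after running
 * @execute.)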
 */
STATIC bool
xfs_inode_walk_ag_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/*
 * For a given per-AG structure @pag, grab, @execute, and rele all incore
 * inodes with the given radix tree @tag.
 */
STATIC int
xfs_inode_walk_ag(
	struct xfs_perag	*pag,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == XFS_ICI_NO_TAG)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], args);
			xfs_irele(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
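		/*
		 * last_error already holds -EFSCORRUPTED at this point, so
		 * the error is still reported to the caller.
		 */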
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Fetch the next (possibly tagged) per-AG structure. */
static inline struct xfs_perag *
xfs_inode_walk_get_perag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	int			tag)
{
	if (tag == XFS_ICI_NO_TAG)
		return xfs_perag_get(mp, agno);
	return xfs_perag_get_tag(mp, agno, tag);
}

/*
 * Call the @execute function on all incore inodes matching the radix tree
 * @tag.
 */
int
xfs_inode_walk(
	struct xfs_mount	*mp,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_eofblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space. This is queued
 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 */
void
xfs_queue_cowblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_cowblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_cowblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_cowblocks(mp);
}

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided.
 * Those races should be resolved only after we have ensured that we are able
 * to reclaim this inode and the world can see that we are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes. This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode lifetime early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during a filesystem unmount reclaim walk would leak all the
 * unreclaimed inodes.
 *
 * The walk itself does not report whether anything was skipped; callers that
 * want to block until all dirty inodes are written back and reclaimed simply
 * loop until no AG carries the reclaim tag, as xfs_reclaim_inodes() does.
 */
static void
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || !xfs_reclaim_inode_grab(ip))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (batch[i])
					xfs_reclaim_inode(batch[i], pag);
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;
			cond_resched();
		} while (nr_found && !done && *nr_to_scan > 0);

		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
		xfs_perag_put(pag);
	}
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	int			nr_to_scan = INT_MAX;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here. We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do, so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC bool
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid != eofb->eof_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_inode_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid == eofb->eof_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @eofb?  The inode is eligible if @eofb is null or
 * if the predicate functions match.
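 * (A NULL @eofb, as passed in by the background workers above, therefore
 * matches every tagged inode.)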
 */
static bool
xfs_inode_matches_eofb(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	bool			match;

	if (!eofb)
		return true;

	if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
		match = xfs_inode_match_id_union(ip, eofb);
	else
		match = xfs_inode_match_id(ip, eofb);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
	    XFS_ISIZE(ip) < eofb->eof_min_file_size)
		return false;

	return true;
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);
	int		nr_to_scan = INT_MAX;

	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	bool			wait;
	int			ret;

	wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}

	ret = xfs_free_eofblocks(ip);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
			XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
 * multiple quotas, we don't know exactly which quota caused an allocation
 * failure. We make a best effort by including each quota under low free space
 * conditions (less than 1% free space) in the scan.
 */
static int
__xfs_inode_free_quota_eofblocks(
	struct xfs_inode	*ip,
	int			(*execute)(struct xfs_mount *mp,
					   struct xfs_eofblocks	*eofb))
{
	int			scan = 0;
	struct xfs_eofblocks	eofb = {0};
	struct xfs_dquot	*dq;

	/*
	 * Run a sync scan to increase effectiveness and use the union filter to
	 * cover all applicable quotas in a single scan.
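	 * (With XFS_EOF_FLAGS_SYNC set, xfs_inode_free_eofblocks() returns
	 * -EAGAIN on IOLOCK contention, so the walk revisits busy inodes.)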
	 */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_uid = VFS_I(ip)->i_uid;
			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
			scan = 1;
		}
	}

	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_gid = VFS_I(ip)->i_gid;
			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
			scan = 1;
		}
	}

	if (scan)
		execute(ip->i_mount, &eofb);

	return scan;
}

int
xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}

static inline unsigned long
xfs_iflag_for_tag(
	int		tag)
{
	switch (tag) {
	case XFS_ICI_EOFBLOCKS_TAG:
		return XFS_IEOFBLOCKS;
	case XFS_ICI_COWBLOCKS_TAG:
		return XFS_ICOWBLOCKS;
	default:
		ASSERT(0);
		return 0;
	}
}

static void
__xfs_inode_set_blocks_tag(
	xfs_inode_t	*ip,
	void		(*execute)(struct xfs_mount *mp),
	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				  int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & xfs_iflag_for_tag(tag))
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   tag);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* kick off background trimming */
		execute(ip->i_mount);

		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
			trace_xfs_perag_set_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

static void
__xfs_inode_clear_blocks_tag(
	xfs_inode_t	*ip,
	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				    int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     tag);
		spin_unlock(&ip->i_mount->m_perag_lock);
		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Set ourselves up to free CoW blocks from this file. If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork. Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long. If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	int			ret = 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/* Free the CoW blocks */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
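	 * (i.e. the IOLOCK and the MMAPLOCK, both taken exclusively above).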
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_cowblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
			XFS_ICI_COWBLOCKS_TAG);
}

int
xfs_inode_free_quota_cowblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
			trace_xfs_perag_set_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_stop_block_reaping(
	struct xfs_mount *mp)
{
	cancel_delayed_work_sync(&mp->m_eofblocks_work);
	cancel_delayed_work_sync(&mp->m_cowblocks_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_start_block_reaping(
	struct xfs_mount *mp)
{
	xfs_queue_eofblocks(mp);
	xfs_queue_cowblocks(mp);
}