// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents(). This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip)) {
		return 0;
	}
	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) {
		return ip->i_d.di_extsize;
	}
	if (XFS_IS_REALTIME_INODE(ip)) {
		return ip->i_mount->m_sb.sb_rextsize;
	}
	return 0;
}

/*
 * Helper function to extract CoW extent size hint from inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two. If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip)
{
	xfs_extlen_t a, b;

	a = 0;
	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
		a = ip->i_d.di_cowextsize;
	}
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0) {
		return XFS_DEFAULT_COWEXTSZ_HINT;
	}
	return a;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code. They are used in places that wish to lock the
 * inode solely for reading the extents. The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format. If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in. Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though. What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
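 *
 * A minimal usage sketch (illustrative, not quoted from any caller):
 *
 *	uint lock_mode;
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	... read the data fork extent list ...
 *	xfs_iunlock(ip, lock_mode);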
 */
uint xfs_ilock_data_map_shared(struct xfs_inode *ip)
{
	uint lock_mode = XFS_ILOCK_SHARED;

	if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) {
		lock_mode = XFS_ILOCK_EXCL;
	}
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint xfs_ilock_attr_map_shared(struct xfs_inode *ip)
{
	uint lock_mode = XFS_ILOCK_SHARED;

	if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) {
		lock_mode = XFS_ILOCK_EXCL;
	}
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order
 *
 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
 *
 * mmap_lock locking order
 *
 * i_rwsem -> page lock -> mmap_lock
 * mmap_lock -> i_mmap_lock -> page_lock
 *
 * The difference in mmap_lock locking order means that we cannot hold the
 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
 * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
 * in get_user_pages() to map the user pages into the kernel address space for
 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
 * page faults already hold the mmap_lock.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
 * taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
void xfs_ilock(xfs_inode_t *ip, uint lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_rwsem, XFS_IOLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_rwsem, XFS_IOLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	}
}

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep. It returns 1 if it gets
 * the requested locks and 0 otherwise. If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	locked. See the comment for xfs_ilock() for a list
 *	of valid values.
 */
int xfs_ilock_nowait(xfs_inode_t *ip, uint lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) {
			goto out;
		}
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) {
			goto out;
		}
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!mrtryupdate(&ip->i_mmaplock)) {
			goto out_undo_iolock;
		}
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!mrtryaccess(&ip->i_mmaplock)) {
			goto out_undo_iolock;
		}
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!mrtryupdate(&ip->i_lock)) {
			goto out_undo_mmaplock;
		}
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!mrtryaccess(&ip->i_lock)) {
			goto out_undo_mmaplock;
		}
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		mrunlock_excl(&ip->i_mmaplock);
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		mrunlock_shared(&ip->i_mmaplock);
	}
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL) {
		up_write(&VFS_I(ip)->i_rwsem);
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		up_read(&VFS_I(ip)->i_rwsem);
	}
out:
	return 0;
}

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	unlocked. See the comment for xfs_ilock() for a list
 *	of valid values for this parameter.
 *
 */
void xfs_iunlock(xfs_inode_t *ip, uint lock_flags)
{
	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
	ASSERT(lock_flags != 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		up_write(&VFS_I(ip)->i_rwsem);
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		up_read(&VFS_I(ip)->i_rwsem);
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		mrunlock_excl(&ip->i_mmaplock);
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		mrunlock_shared(&ip->i_mmaplock);
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		mrunlock_excl(&ip->i_lock);
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		mrunlock_shared(&ip->i_lock);
	}

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * give up write locks. the i/o lock cannot be held nested
 * if it is being demoted.
 */
void xfs_ilock_demote(xfs_inode_t *ip, uint lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL) {
		mrdemote(&ip->i_lock);
	}
	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		mrdemote(&ip->i_mmaplock);
	}
	if (lock_flags & XFS_IOLOCK_EXCL) {
		downgrade_write(&VFS_I(ip)->i_rwsem);
	}

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}

#if defined(DEBUG) || defined(XFS_WARN)
int xfs_isilocked(xfs_inode_t *ip, uint lock_flags)
{
	if (lock_flags & (XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)) {
		if (!(lock_flags & XFS_ILOCK_SHARED)) {
			return !!ip->i_lock.mr_writer;
		}
		return rwsem_is_locked(&ip->i_lock.mr_lock);
	}

	if (lock_flags & (XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)) {
		if (!(lock_flags & XFS_MMAPLOCK_SHARED)) {
			return !!ip->i_mmaplock.mr_writer;
		}
		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
	}

	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
		if (!(lock_flags & XFS_IOLOCK_SHARED)) {
			return !debug_locks || lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
		}
		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
	}

	ASSERT(0);
	return 0;
}
#endif

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool xfs_lockdep_subclass_ok(int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
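 *
 * As an illustration (not a new rule), an XFS_ILOCK_EXCL request with
 * subclass 2 comes back from the helper below as
 * (XFS_ILOCK_EXCL | (2 << XFS_ILOCK_SHIFT)), i.e. the subclass is folded
 * into the lock_mode value that is then handed to xfs_ilock().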
 */
static inline int xfs_lock_inumorder(int lock_mode, int subclass)
{
	int class = 0;

	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM)));
	ASSERT(xfs_lockdep_subclass_ok(subclass));

	if (lock_mode & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
		class += subclass << XFS_IOLOCK_SHIFT;
	}

	if (lock_mode & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) {
		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
		class += subclass << XFS_MMAPLOCK_SHIFT;
	}

	if (lock_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
		class += subclass << XFS_ILOCK_SHIFT;
	}

	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}

/*
 * The following routine will lock n inodes in exclusive mode. We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
static void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode)
{
	int attempts = 0, i, j, try_lock;
	struct xfs_log_item *lp;

	/*
	 * Currently supports between 2 and 5 inodes with exclusive locking. We
	 * support an arbitrary depth of locking here, but absolute limits on
	 * inodes depend on the type of locking and the limits placed by
	 * lockdep annotations in xfs_lock_inumorder. These are all checked by
	 * the asserts.
	 */
	ASSERT(ips && inodes >= 2 && inodes <= 5);
	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL));
	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED)));
	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

	if (lock_mode & XFS_IOLOCK_EXCL) {
		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
	} else if (lock_mode & XFS_MMAPLOCK_EXCL) {
		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
	}

	try_lock = 0;
	i = 0;
again:
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i - 1])) {	/* Already locked */
			continue;
		}

		/*
		 * If try_lock is not set yet, make sure all locked inodes are
		 * not in the AIL. If any are, set try_lock to be used later.
		 */
		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = &ips[j]->i_itemp->ili_item;
				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
					try_lock++;
				}
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */
		if (!try_lock) {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
			continue;
		}

		/* try_lock means we have an inode locked that is in the AIL. */
		ASSERT(i != 0);
		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
			continue;
		}

		/*
		 * Unlock all previous guys and try again. xfs_iunlock will try
		 * to push the tail if the inode is in the AIL.
		 */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			/*
			 * Check to see if we've already unlocked this one. Not
			 * the first one going back, and the inode ptr is the
			 * same.
			 */
			if (j != (i - 1) && ips[j] == ips[j + 1]) {
				continue;
			}

			xfs_iunlock(ips[j], lock_mode);
		}

		if ((attempts % 5) == 0) {
			delay(1);	/* Don't just spin the CPU */
		}
		i = 0;
		try_lock = 0;
		goto again;
	}
}

/*
 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
 * the mmaplock or the ilock, but not more than one type at a time. If we lock
 * more than one at a time, lockdep will report false positives saying we have
 * violated locking orders. The iolock must be double-locked separately since
 * we use i_rwsem for that. We now support taking one lock EXCL and the other
 * SHARED.
 */
void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode)
{
	struct xfs_inode *temp;
	uint mode_temp;
	int attempts = 0;
	struct xfs_log_item *lp;

	ASSERT(hweight32(ip0_mode) == 1);
	ASSERT(hweight32(ip1_mode) == 1);
	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) ||
			!(ip0_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) ||
			!(ip1_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) ||
			!(ip0_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) ||
			!(ip1_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));

	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		temp = ip0;
		ip0 = ip1;
		ip1 = temp;
		mode_temp = ip0_mode;
		ip0_mode = ip1_mode;
		ip1_mode = mode_temp;
	}

	while (1) {
		xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

		/*
		 * If the first lock we have locked is in the AIL, we must TRY to get
		 * the second lock. If we can't get it, we must release the first one
		 * and try again.
		 */
		lp = &ip0->i_itemp->ili_item;
		if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
			if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
				xfs_iunlock(ip0, ip0_mode);
				if ((++attempts % 5) == 0) {
					delay(1);	/* Don't just spin the CPU */
				}
				continue;
			}
		} else {
			xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
		}
		break;
	}
}

STATIC uint _xfs_dic2xflags(uint16_t di_flags, uint64_t di_flags2, bool has_attr)
{
	uint flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME) {
			flags |= FS_XFLAG_REALTIME;
		}
		if (di_flags & XFS_DIFLAG_PREALLOC) {
			flags |= FS_XFLAG_PREALLOC;
		}
		if (di_flags & XFS_DIFLAG_IMMUTABLE) {
			flags |= FS_XFLAG_IMMUTABLE;
		}
		if (di_flags & XFS_DIFLAG_APPEND) {
			flags |= FS_XFLAG_APPEND;
		}
		if (di_flags & XFS_DIFLAG_SYNC) {
			flags |= FS_XFLAG_SYNC;
		}
		if (di_flags & XFS_DIFLAG_NOATIME) {
			flags |= FS_XFLAG_NOATIME;
		}
		if (di_flags & XFS_DIFLAG_NODUMP) {
			flags |= FS_XFLAG_NODUMP;
		}
		if (di_flags & XFS_DIFLAG_RTINHERIT) {
			flags |= FS_XFLAG_RTINHERIT;
		}
		if (di_flags & XFS_DIFLAG_PROJINHERIT) {
			flags |= FS_XFLAG_PROJINHERIT;
		}
		if (di_flags & XFS_DIFLAG_NOSYMLINKS) {
			flags |= FS_XFLAG_NOSYMLINKS;
		}
		if (di_flags & XFS_DIFLAG_EXTSIZE) {
			flags |= FS_XFLAG_EXTSIZE;
		}
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT) {
			flags |= FS_XFLAG_EXTSZINHERIT;
		}
		if (di_flags & XFS_DIFLAG_NODEFRAG) {
			flags |= FS_XFLAG_NODEFRAG;
		}
		if (di_flags & XFS_DIFLAG_FILESTREAM) {
			flags |= FS_XFLAG_FILESTREAM;
		}
	}

	if (di_flags2 & XFS_DIFLAG2_ANY) {
		if (di_flags2 & XFS_DIFLAG2_DAX) {
			flags |= FS_XFLAG_DAX;
		}
		if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
			flags |= FS_XFLAG_COWEXTSIZE;
		}
	}

	if (has_attr) {
		flags |= FS_XFLAG_HASATTR;
	}

	return flags;
}

uint xfs_ip2xflags(struct xfs_inode *ip)
{
	struct xfs_icdinode *dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int xfs_lookup(xfs_inode_t *dp, struct xfs_name *name, xfs_inode_t **ipp, struct xfs_name *ci_name)
{
	xfs_ino_t inum;
	int error;

	trace_xfs_lookup(dp, name);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
		return -EIO;
	}

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error) {
		goto out_unlock;
	}

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error) {
		goto out_free_name;
	}

	return 0;

out_free_name:
	if (ci_name) {
		kmem_free(ci_name->name);
	}
out_unlock:
	*ipp = NULL;
	return error;
}

/* Propagate di_flags from a parent inode to a child inode. */
static void xfs_inode_inherit_flags(struct xfs_inode *ip, const struct xfs_inode *pip)
{
	unsigned int di_flags = 0;
	umode_t mode = VFS_I(ip)->i_mode;

	if (S_ISDIR(mode)) {
		if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
			di_flags |= XFS_DIFLAG_RTINHERIT;
		}
		if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			ip->i_d.di_extsize = pip->i_d.di_extsize;
		}
		if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) {
			di_flags |= XFS_DIFLAG_PROJINHERIT;
		}
	} else if (S_ISREG(mode)) {
		if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) {
			di_flags |= XFS_DIFLAG_REALTIME;
		}
		if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSIZE;
			ip->i_d.di_extsize = pip->i_d.di_extsize;
		}
	}
	if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) {
		di_flags |= XFS_DIFLAG_NOATIME;
	}
	if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && xfs_inherit_nodump) {
		di_flags |= XFS_DIFLAG_NODUMP;
	}
	if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && xfs_inherit_sync) {
		di_flags |= XFS_DIFLAG_SYNC;
	}
	if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) {
		di_flags |= XFS_DIFLAG_NOSYMLINKS;
	}
	if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) {
		di_flags |= XFS_DIFLAG_NODEFRAG;
	}
	if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) {
		di_flags |= XFS_DIFLAG_FILESTREAM;
	}

	ip->i_d.di_flags |= di_flags;
}

/* Propagate di_flags2 from a parent inode to a child inode. */
static void xfs_inode_inherit_flags2(struct xfs_inode *ip, const struct xfs_inode *pip)
{
	if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
		ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
		ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
	}
	if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) {
		ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
	}
}

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
 * appropriately within the inode. The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode. Finally, fill in the inode and
 * log its initial contents. In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context. The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
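 *
 * xfs_dir_ialloc() below is one caller that follows this two-phase protocol:
 * it bholds the returned buffer, rolls the transaction, rejoins the buffer
 * and then calls xfs_ialloc() a second time.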
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
static int xfs_ialloc(xfs_trans_t *tp, xfs_inode_t *pip, umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid,
		xfs_buf_t **ialloc_context, xfs_inode_t **ipp)
{
	struct xfs_mount *mp = tp->t_mountp;
	xfs_ino_t ino;
	xfs_inode_t *ip;
	uint flags;
	int error;
	struct timespec64 tv;
	struct inode *inode;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino);
	if (error) {
		return error;
	}
	if (*ialloc_context || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		return -EFSCORRUPTED;
	}

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error) {
		return error;
	}
	ASSERT(ip != NULL);
	inode = VFS_I(ip);
	inode->i_mode = mode;
	set_nlink(inode, nlink);
	inode->i_uid = current_fsuid();
	inode->i_rdev = rdev;
	ip->i_d.di_projid = prid;

	if (pip && XFS_INHERIT_GID(pip)) {
		inode->i_gid = VFS_I(pip)->i_gid;
		if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) {
			inode->i_mode |= S_ISGID;
		}
	} else {
		inode->i_gid = current_fsgid();
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) {
		inode->i_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_df.if_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	tv = current_time(inode);
	inode->i_mtime = tv;
	inode->i_atime = tv;
	inode->i_ctime = tv;

	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;

	if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
		inode_set_iversion(inode, 1);
		ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
		ip->i_d.di_cowextsize = 0;
		ip->i_d.di_crtime = tv;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			xfs_inode_inherit_flags(ip, pip);
		}
		if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
			xfs_inode_inherit_flags2(ip, pip);
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = 0;
		ip->i_df.if_u1.if_root = NULL;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup the inode structure */
	xfs_setup_inode(ip);

	*ipp = ip;
	return 0;
}

/*
 * Allocates a new inode from disk and returns a pointer to the
 * incore copy. This routine will internally commit the current
 * transaction and allocate a new one if the Space Manager needed
 * to do an allocation to replenish the inode free-list.
 *
 * This routine is designed to be called from xfs_create and
 * xfs_create_dir.
 *
 */
int xfs_dir_ialloc(xfs_trans_t **tpp, /* input: current transaction;
					 output: may be a new transaction. */
		xfs_inode_t *dp, /* directory within which to allocate
				    the inode. */
		umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */
		xfs_inode_t **ipp) /* pointer to inode; it will be
				      locked. */
{
	xfs_trans_t *tp;
	xfs_inode_t *ip;
	xfs_buf_t *ialloc_context = NULL;
	int code;
	void *dqinfo;
	uint tflags;

	tp = *tpp;
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

	/*
	 * xfs_ialloc will return a pointer to an incore inode if
	 * the Space Manager has an available inode on the free
	 * list. Otherwise, it will do an allocation and replenish
	 * the freelist. Since we can only do one allocation per
	 * transaction without deadlocks, we will need to commit the
	 * current transaction and start a new one. We will then
	 * need to call xfs_ialloc again to get the inode.
	 *
	 * If xfs_ialloc did an allocation to replenish the freelist,
	 * it returns the bp containing the head of the freelist as
	 * ialloc_context. We will hold a lock on it across the
	 * transaction commit so that no other process can steal
	 * the inode(s) that we've just allocated.
	 */
	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, &ip);
	/*
	 * Return an error if we were unable to allocate a new inode.
	 * This should only happen if we run out of space on disk or
	 * encounter a disk error.
	 */
	if (code) {
		*ipp = NULL;
		return code;
	}
	if (!ialloc_context && !ip) {
		*ipp = NULL;
		return -ENOSPC;
	}

	/*
	 * If the AGI buffer is non-NULL, then we were unable to get an
	 * inode in one operation. We need to commit the current
	 * transaction and call xfs_ialloc() again. It is guaranteed
	 * to succeed the second time.
	 */
	if (ialloc_context) {
		/*
		 * Normally, xfs_trans_commit releases all the locks.
		 * We call bhold to hang on to the ialloc_context across
		 * the commit. Holding this buffer prevents any other
		 * processes from doing any allocations in this
		 * allocation group.
		 */
		xfs_trans_bhold(tp, ialloc_context);

		/*
		 * We want the quota changes to be associated with the next
		 * transaction, NOT this one. So, detach the dqinfo from this
		 * and attach it to the next transaction.
		 */
		dqinfo = NULL;
		tflags = 0;
		if (tp->t_dqinfo) {
			dqinfo = (void *)tp->t_dqinfo;
			tp->t_dqinfo = NULL;
			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
		}

		code = xfs_trans_roll(&tp);

		/*
		 * Re-attach the quota info that we detached from prev trx.
		 */
		if (dqinfo) {
			tp->t_dqinfo = dqinfo;
			tp->t_flags |= tflags;
		}

		if (code) {
			xfs_buf_relse(ialloc_context);
			*tpp = tp;
			*ipp = NULL;
			return code;
		}
		xfs_trans_bjoin(tp, ialloc_context);

		/*
		 * Call ialloc again. Since we've locked out all
		 * other allocations in this allocation group,
		 * this call should always succeed.
		 */
		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, &ip);
		/*
		 * If we get an error at this point, return to the caller
		 * so that the current transaction can be aborted.
		 */
		if (code) {
			*tpp = tp;
			*ipp = NULL;
			return code;
		}
		ASSERT(!ialloc_context && ip);
	}

	*ipp = ip;
	*tpp = tp;

	return 0;
}

/*
 * Decrement the link count on an inode & log the change. If this causes the
 * link count to go to zero, move the inode to AGI unlinked list so that it can
 * be freed when the last active reference goes away via xfs_inactive().
 */
static int xfs_droplink(xfs_trans_t *tp, xfs_inode_t *ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (VFS_I(ip)->i_nlink) {
		return 0;
	}

	return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
static void xfs_bumplink(xfs_trans_t *tp, xfs_inode_t *ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	inc_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

int xfs_create(xfs_inode_t *dp, struct xfs_name *name, umode_t mode, dev_t rdev, xfs_inode_t **ipp)
{
	int is_dir = S_ISDIR(mode);
	struct xfs_mount *mp = dp->i_mount;
	struct xfs_inode *ip = NULL;
	struct xfs_trans *tp = NULL;
	int error;
	bool unlock_dp_on_error = false;
	prid_t prid;
	struct xfs_dquot *udqp = NULL;
	struct xfs_dquot *gdqp = NULL;
	struct xfs_dquot *pdqp = NULL;
	struct xfs_trans_res *tres;
	uint resblks;

	trace_xfs_create(dp, name);

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp,
			&gdqp, &pdqp);
	if (error) {
		return error;
	}

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_create;
	}

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case. If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
	}
	if (error) {
		goto out_release_inode;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	/*
	 * Reserve disk quota and the inode.
	 */
	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0);
	if (error) {
		goto out_trans_cancel;
	}

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to it, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
	if (error) {
		goto out_trans_cancel;
	}

	/*
	 * Now we join the directory inode to the transaction. We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks). An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error) {
			goto out_trans_cancel;
		}

		xfs_bumplink(tp, dp);
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC | XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error) {
		goto out_release_inode;
	}

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode. This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error) {
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	}
	return error;
}

int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp)
{
	struct xfs_mount *mp = dp->i_mount;
	struct xfs_inode *ip = NULL;
	struct xfs_trans *tp = NULL;
	int error;
	prid_t prid;
	struct xfs_dquot *udqp = NULL;
	struct xfs_dquot *gdqp = NULL;
	struct xfs_dquot *pdqp = NULL;
	struct xfs_trans_res *tres;
	uint resblks;

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp,
			&gdqp, &pdqp);
	if (error) {
		return error;
	}

	resblks = XFS_IALLOC_SPACE_RES(mp);
	tres = &M_RES(mp)->tr_create_tmpfile;

	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
	if (error) {
		goto out_release_inode;
	}

	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0);
	if (error) {
		goto out_trans_cancel;
	}

	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
	if (error) {
		goto out_trans_cancel;
	}

	if (mp->m_flags & XFS_MOUNT_WSYNC) {
		xfs_trans_set_sync(tp);
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_iunlink(tp, ip);
	if (error) {
		goto out_trans_cancel;
	}

	error = xfs_trans_commit(tp);
	if (error) {
		goto out_release_inode;
	}

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode. This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}

int xfs_link(xfs_inode_t *tdp, xfs_inode_t *sip, struct xfs_name *target_name)
{
	xfs_mount_t *mp = tdp->i_mount;
	xfs_trans_t *tp;
	int error;
	int resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));

	if (XFS_FORCED_SHUTDOWN(mp)) {
		return -EIO;
	}

	error = xfs_qm_dqattach(sip);
	if (error) {
		goto std_return;
	}

	error = xfs_qm_dqattach(tdp);
	if (error) {
		goto std_return;
	}

	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
	if (error == -ENOSPC) {
		resblks = 0;
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
	}
	if (error) {
		goto std_return;
	}

	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && tdp->i_d.di_projid != sip->i_d.di_projid)) {
		error = -EXDEV;
		goto error_return;
	}

	if (!resblks) {
		error = xfs_dir_canenter(tp, tdp, target_name);
		if (error) {
			goto error_return;
		}
	}

	/*
	 * Handle initial link state of O_TMPFILE inode
	 */
	if (VFS_I(sip)->i_nlink == 0) {
		error = xfs_iunlink_remove(tp, sip);
		if (error) {
			goto error_return;
		}
	}

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, resblks);
	if (error) {
		goto error_return;
	}
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	xfs_bumplink(tp, sip);

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC | XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	return xfs_trans_commit(tp);

error_return:
	xfs_trans_cancel(tp);
std_return:
	return error;
}

/* Clear the reflink flag and the cowblocks tag if possible. */
static void xfs_itruncate_clear_reflink_flags(struct xfs_inode *ip)
{
	struct xfs_ifork *dfork;
	struct xfs_ifork *cfork;

	if (!xfs_is_reflink_inode(ip)) {
		return;
	}
	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	if (dfork->if_bytes == 0 && cfork->if_bytes == 0) {
		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
	}
	if (cfork->if_bytes == 0) {
		xfs_inode_clear_cowblocks_tag(ip);
	}
}

/*
 * Free up the underlying blocks past new_size. The new size must be smaller
 * than the current size. This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here. Some transaction will be
 * returned to the caller to be committed. The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction. On return the inode
 * will be "held" within the returned transaction. This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not. We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int xfs_itruncate_extents_flags(struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, xfs_fsize_t new_size,
		int flags)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_trans *tp = *tpp;
	xfs_fileoff_t first_unmap_block;
	xfs_filblks_t unmap_len;
	int error = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(!atomic_read(&VFS_I(ip)->i_count) || xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	flags |= xfs_bmapi_aflag(whichfork);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.
	 *
	 * We have to free all the blocks to the bmbt maximum offset, even if
	 * the page cache can't scale that far.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	if (first_unmap_block >= XFS_MAX_FILEOFF) {
		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
		return 0;
	}

	unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
	while (unmap_len > 0) {
		ASSERT(tp->t_firstblock == NULLFSBLOCK);
		error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS);
		if (error) {
			goto out;
		}

		/* free the just unmapped extents */
		error = xfs_defer_finish(&tp);
		if (error) {
			goto out;
		}
	}

	if (whichfork == XFS_DATA_FORK) {
		/* Remove all pending CoW reservations. */
		error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, XFS_MAX_FILEOFF, true);
		if (error) {
			goto out;
		}

		xfs_itruncate_clear_reflink_flags(ip);
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
}

int xfs_release(xfs_inode_t *ip)
{
	xfs_mount_t *mp = ip->i_mount;
	int error;

	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) {
		return 0;
	}

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY) {
		return 0;
	}

	if (!XFS_FORCED_SHUTDOWN(mp)) {
		int truncated;

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close. This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash. What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (ip->i_delayed_blks > 0) {
				error = filemap_flush(VFS_I(ip)->i_mapping);
				if (error) {
					return error;
				}
			}
		}
	}

	if (VFS_I(ip)->i_nlink == 0) {
		return 0;
	}

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * Check if the inode is being opened, written and closed
		 * frequently and we have delayed allocation blocks outstanding
		 * (e.g. streaming writes from the NFS server), truncating the
		 * blocks past EOF will cause fragmentation to occur.
		 *
		 * In this case don't do the truncation, but we have to be
		 * careful how we detect this case. Blocks beyond EOF show up as
		 * i_delayed_blks even when the inode is clean, so we need to
		 * truncate them away first before checking for a dirty release.
		 * Hence on the first dirty close we will still remove the
		 * speculative allocation, but after that we will leave it in
		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) {
			return 0;
		}
		/*
		 * If we can't get the iolock just skip truncating the blocks
		 * past EOF because we could deadlock with the mmap_lock
		 * otherwise. We'll get another chance to drop them once the
		 * last reference to the inode is dropped, so we'll never leak
		 * blocks permanently.
		 */
		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
			error = xfs_free_eofblocks(ip);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			if (error) {
				return error;
			}
		}

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks) {
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
		}
	}
	return 0;
}

/*
 * xfs_inactive_truncate
 *
 * Called to perform a truncate when an inode becomes unlinked.
 */
STATIC int xfs_inactive_truncate(struct xfs_inode *ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_trans *tp;
	int error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Log the inode size first to prevent stale data exposure in the event
	 * of a system crash before the truncate completes. See the related
	 * comment in xfs_vn_setattr_size() for details.
	 */
	ip->i_d.di_size = 0;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
	if (error) {
		goto error_trans_cancel;
	}

	ASSERT(ip->i_df.if_nextents == 0);

	error = xfs_trans_commit(tp);
	if (error) {
		goto error_unlock;
	}

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;

error_trans_cancel:
	xfs_trans_cancel(tp);
error_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_ifree()
 *
 * Perform the inode free when an inode is unlinked.
 */
STATIC int xfs_inactive_ifree(struct xfs_inode *ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_trans *tp;
	int error;

	/*
	 * We try to use a per-AG reservation for any block needed by the finobt
	 * tree, but as the finobt feature predates the per-AG reservation
	 * support a degraded file system might not have enough space for the
	 * reservation at mount time. In that case try to dip into the reserved
	 * pool and pray.
	 *
	 * Send a warning if the reservation does happen to fail, as the inode
	 * now remains allocated and sits on the unlinked list until the fs is
	 * repaired.
	 */
	if (unlikely(mp->m_finobt_nores)) {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
	} else {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
	}
	if (error) {
		if (error == -ENOSPC) {
			xfs_warn_ratelimited(mp, "Failed to remove inode(s) from unlinked list. "
					"Please free space, unmount and run xfs_repair.");
		} else {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
		}
		return error;
	}

	/*
	 * We do not hold the inode locked across the entire rolling transaction
	 * here. We only need to hold it for the first transaction that
	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
	 * here breaks the relationship between cluster buffer invalidation and
	 * stale inode invalidation on cluster buffer item journal commit
	 * completion, and can result in leaving dirty stale inodes hanging
	 * around in memory.
	 *
	 * We have no need for serialising this inode operation against other
	 * operations - we freed the inode and hence reallocation is required
	 * and that will serialise on reallocating the space the deferops need
	 * to free. Hence we can unlock the inode on the first commit of
	 * the transaction rather than roll it right through the deferops. This
	 * avoids relogging the XFS_ISTALE inode.
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	if (error) {
		/*
		 * If we fail to free the inode, shut down. The cancel
		 * might do that, we need to make sure. Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d", __func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	/*
	 * Just ignore errors at this point. There is nothing we can do except
	 * to try to keep going. Make sure it's not a silent error.
	 */
	error = xfs_trans_commit(tp);
	if (error) {
		xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error);
	}

	return 0;
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero. If the file has been unlinked, then it must
 * now be truncated. Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
void xfs_inactive(xfs_inode_t *ip)
{
	struct xfs_mount *mp;
	int error;
	int truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY) {
		return;
	}

	/* Try to clean out the cow blocks if there are any. */
	if (xfs_inode_has_cow_data(ip)) {
		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
	}

	if (VFS_I(ip)->i_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 *
		 * Note: don't bother with iolock here since lockdep complains
		 * about acquiring it in reclaim context. We have the only
		 * reference to the inode at this point anyways.
		 */
		if (xfs_can_free_eofblocks(ip, true)) {
			xfs_free_eofblocks(ip);
		}

		return;
	}

	if (S_ISREG(VFS_I(ip)->i_mode) &&
	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) {
		truncate = 1;
	}

	error = xfs_qm_dqattach(ip);
	if (error) {
		return;
	}

	if (S_ISLNK(VFS_I(ip)->i_mode)) {
		error = xfs_inactive_symlink(ip);
	} else if (truncate) {
		error = xfs_inactive_truncate(ip);
	}
	if (error) {
		return;
	}

	/*
	 * If there are attributes associated with the file then blow them away
	 * now. The code calls a routine that recursively deconstructs the
	 * attribute fork. It also blows away the in-core attribute fork.
	 */
	if (XFS_IFORK_Q(ip)) {
		error = xfs_attr_inactive(ip);
		if (error) {
			return;
		}
	}

	ASSERT(!ip->i_afp);
	ASSERT(ip->i_d.di_forkoff == 0);

	/*
	 * Free the inode.
	 */
	error = xfs_inactive_ifree(ip);
	if (error) {
		return;
	}

	/*
	 * Release the dquots held by inode, if any.
	 */
	xfs_qm_dqdetach(ip);
}

/*
 * In-Core Unlinked List Lookups
 * =============================
 *
 * Every inode is supposed to be reachable from some other piece of metadata
 * with the exception of the root directory. Inodes with a connection to a
Inodes with a connection to a 1867 * file descriptor but not linked from anywhere in the on-disk directory tree 1868 * are collectively known as unlinked inodes, though the filesystem itself 1869 * maintains links to these inodes so that on-disk metadata are consistent. 1870 * 1871 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI 1872 * header contains a number of buckets that point to an inode, and each inode 1873 * record has a pointer to the next inode in the hash chain. This 1874 * singly-linked list causes scaling problems in the iunlink remove function 1875 * because we must walk that list to find the inode that points to the inode 1876 * being removed from the unlinked hash bucket list. 1877 * 1878 * What if we modelled the unlinked list as a collection of records capturing 1879 * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd 1880 * have a fast way to look up unlinked list predecessors, which avoids the 1881 * slow list walk. That's exactly what we do here (in-core) with a per-AG 1882 * rhashtable. 1883 * 1884 * Because this is a backref cache, we ignore operational failures since the 1885 * iunlink code can fall back to the slow bucket walk. The only errors that 1886 * should bubble out are for obviously incorrect situations. 1887 * 1888 * All users of the backref cache MUST hold the AGI buffer lock to serialize 1889 * access or have otherwise provided for concurrency control. 1890 */ 1891 1892/* Capture a "X.next_unlinked = Y" relationship. */ 1893struct xfs_iunlink { 1894 struct rhash_head iu_rhash_head; 1895 xfs_agino_t iu_agino; /* X */ 1896 xfs_agino_t iu_next_unlinked; /* Y */ 1897}; 1898 1899/* Unlinked list predecessor lookup hashtable construction */ 1900static int xfs_iunlink_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) 1901{ 1902 const xfs_agino_t *key = arg->key; 1903 const struct xfs_iunlink *iu = obj; 1904 1905 if (iu->iu_next_unlinked != *key) { 1906 return 1; 1907 } 1908 return 0; 1909} 1910 1911static const struct rhashtable_params xfs_iunlink_hash_params = { 1912 .min_size = XFS_AGI_UNLINKED_BUCKETS, 1913 .key_len = sizeof(xfs_agino_t), 1914 .key_offset = offsetof(struct xfs_iunlink, iu_next_unlinked), 1915 .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), 1916 .automatic_shrinking = true, 1917 .obj_cmpfn = xfs_iunlink_obj_cmpfn, 1918}; 1919 1920/* 1921 * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such 1922 * relation is found. 1923 */ 1924static xfs_agino_t xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino) 1925{ 1926 struct xfs_iunlink *iu; 1927 1928 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, xfs_iunlink_hash_params); 1929 return iu ? iu->iu_agino : NULLAGINO; 1930} 1931 1932/* 1933 * Take ownership of an iunlink cache entry and insert it into the hash table. 1934 * If successful, the entry will be owned by the cache; if not, it is freed. 1935 * Either way, the caller does not own @iu after this call. 1936 */ 1937static int xfs_iunlink_insert_backref(struct xfs_perag *pag, struct xfs_iunlink *iu) 1938{ 1939 int error; 1940 1941 error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, &iu->iu_rhash_head, xfs_iunlink_hash_params); 1942 /* 1943 * Fail loudly if there already was an entry because that's a sign of 1944 * corruption of in-memory data. Also fail loudly if we see an error 1945 * code we didn't anticipate from the rhashtable code. Currently we 1946 * only anticipate ENOMEM. 
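 * In practice that means -EEXIST is passed back to the caller as a sign of
 * a duplicate backref, while any other failure (e.g. -ENOMEM) is absorbed
 * below and the cache quietly degrades to the slow bucket walk.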
1947 */ 1948 if (error) { 1949 WARN(error != -ENOMEM, "iunlink cache insert error %d", error); 1950 kmem_free(iu); 1951 } 1952 /* 1953 * Absorb any runtime errors that aren't a result of corruption because 1954 * this is a cache and we can always fall back to bucket list scanning. 1955 */ 1956 if (error != 0 && error != -EEXIST) { 1957 error = 0; 1958 } 1959 return error; 1960} 1961 1962/* Remember that @prev_agino.next_unlinked = @this_agino. */ 1963static int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino, xfs_agino_t this_agino) 1964{ 1965 struct xfs_iunlink *iu; 1966 1967 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) { 1968 return 0; 1969 } 1970 1971 iu = kmem_zalloc(sizeof(*iu), KM_NOFS); 1972 iu->iu_agino = prev_agino; 1973 iu->iu_next_unlinked = this_agino; 1974 1975 return xfs_iunlink_insert_backref(pag, iu); 1976} 1977 1978/* 1979 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. 1980 * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there 1981 * wasn't any such entry then we don't bother. 1982 */ 1983static int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t agino, xfs_agino_t next_unlinked) 1984{ 1985 struct xfs_iunlink *iu; 1986 int error; 1987 1988 /* Look up the old entry; if there wasn't one then exit. */ 1989 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, xfs_iunlink_hash_params); 1990 if (!iu) { 1991 return 0; 1992 } 1993 1994 /* 1995 * Remove the entry. This shouldn't ever return an error, but if we 1996 * couldn't remove the old entry we don't want to add it again to the 1997 * hash table, and if the entry disappeared on us then someone's 1998 * violated the locking rules and we need to fail loudly. Either way 1999 * we cannot remove the inode because internal state is or would have 2000 * been corrupt. 2001 */ 2002 error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, &iu->iu_rhash_head, xfs_iunlink_hash_params); 2003 if (error) { 2004 return error; 2005 } 2006 2007 /* If there is no new next entry just free our item and return. */ 2008 if (next_unlinked == NULLAGINO) { 2009 kmem_free(iu); 2010 return 0; 2011 } 2012 2013 /* Update the entry and re-add it to the hash table. */ 2014 iu->iu_next_unlinked = next_unlinked; 2015 return xfs_iunlink_insert_backref(pag, iu); 2016} 2017 2018/* Set up the in-core predecessor structures. */ 2019int xfs_iunlink_init(struct xfs_perag *pag) 2020{ 2021 return rhashtable_init(&pag->pagi_unlinked_hash, &xfs_iunlink_hash_params); 2022} 2023 2024/* Free the in-core predecessor structures. */ 2025static void xfs_iunlink_free_item(void *ptr, void *arg) 2026{ 2027 struct xfs_iunlink *iu = ptr; 2028 bool *freed_anything = arg; 2029 2030 *freed_anything = true; 2031 kmem_free(iu); 2032} 2033 2034void xfs_iunlink_destroy(struct xfs_perag *pag) 2035{ 2036 bool freed_anything = false; 2037 2038 rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, xfs_iunlink_free_item, &freed_anything); 2039 2040 ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); 2041} 2042 2043/* 2044 * Point the AGI unlinked bucket at an inode and log the results. The caller 2045 * is responsible for validating the old value. 
2046 */ 2047STATIC int xfs_iunlink_update_bucket(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf *agibp, 2048 unsigned int bucket_index, xfs_agino_t new_agino) 2049{ 2050 struct xfs_agi *agi = agibp->b_addr; 2051 xfs_agino_t old_value; 2052 int offset; 2053 2054 ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); 2055 2056 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2057 trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, old_value, new_agino); 2058 2059 /* 2060 * We should never find the head of the list already set to the value 2061 * passed in because either we're adding or removing ourselves from the 2062 * head of the list. 2063 */ 2064 if (old_value == new_agino) { 2065 xfs_buf_mark_corrupt(agibp); 2066 return -EFSCORRUPTED; 2067 } 2068 2069 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 2070 offset = offsetof(struct xfs_agi, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); 2071 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 2072 return 0; 2073} 2074 2075/* Set an on-disk inode's next_unlinked pointer. */ 2076STATIC void xfs_iunlink_update_dinode(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agino_t agino, struct xfs_buf *ibp, 2077 struct xfs_dinode *dip, struct xfs_imap *imap, xfs_agino_t next_agino) 2078{ 2079 struct xfs_mount *mp = tp->t_mountp; 2080 int offset; 2081 2082 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); 2083 2084 trace_xfs_iunlink_update_dinode(mp, agno, agino, be32_to_cpu(dip->di_next_unlinked), next_agino); 2085 2086 dip->di_next_unlinked = cpu_to_be32(next_agino); 2087 offset = imap->im_boffset + offsetof(struct xfs_dinode, di_next_unlinked); 2088 2089 /* need to recalc the inode CRC if appropriate */ 2090 xfs_dinode_calc_crc(mp, dip); 2091 xfs_trans_inode_buf(tp, ibp); 2092 xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); 2093} 2094 2095/* Set an in-core inode's unlinked pointer and return the old value. */ 2096STATIC int xfs_iunlink_update_inode(struct xfs_trans *tp, struct xfs_inode *ip, xfs_agnumber_t agno, 2097 xfs_agino_t next_agino, xfs_agino_t *old_next_agino) 2098{ 2099 struct xfs_mount *mp = tp->t_mountp; 2100 struct xfs_dinode *dip; 2101 struct xfs_buf *ibp; 2102 xfs_agino_t old_value; 2103 int error; 2104 2105 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); 2106 2107 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); 2108 if (error) { 2109 return error; 2110 } 2111 2112 /* Make sure the old pointer isn't garbage. */ 2113 old_value = be32_to_cpu(dip->di_next_unlinked); 2114 if (!xfs_verify_agino_or_null(mp, agno, old_value)) { 2115 xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); 2116 error = -EFSCORRUPTED; 2117 goto out; 2118 } 2119 2120 /* 2121 * Since we're updating a linked list, we should never find that the 2122 * current pointer is the same as the new value, unless we're 2123 * terminating the list. 2124 */ 2125 *old_next_agino = old_value; 2126 if (old_value == next_agino) { 2127 if (next_agino != NULLAGINO) { 2128 xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); 2129 error = -EFSCORRUPTED; 2130 } 2131 goto out; 2132 } 2133 2134 /* Ok, update the new pointer. 
*/ 2135 xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), ibp, dip, &ip->i_imap, next_agino); 2136 return 0; 2137out: 2138 xfs_trans_brelse(tp, ibp); 2139 return error; 2140} 2141 2142/* 2143 * This is called when the inode's link count has gone to 0 or we are creating 2144 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 2145 * 2146 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2147 * list when the inode is freed. 2148 */ 2149STATIC int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip) 2150{ 2151 struct xfs_mount *mp = tp->t_mountp; 2152 struct xfs_agi *agi; 2153 struct xfs_buf *agibp; 2154 xfs_agino_t next_agino; 2155 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 2156 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2157 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2158 int error; 2159 2160 ASSERT(VFS_I(ip)->i_nlink == 0); 2161 ASSERT(VFS_I(ip)->i_mode != 0); 2162 trace_xfs_iunlink(ip); 2163 2164 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2165 error = xfs_read_agi(mp, tp, agno, &agibp); 2166 if (error) { 2167 return error; 2168 } 2169 agi = agibp->b_addr; 2170 2171 /* 2172 * Get the index into the agi hash table for the list this inode will 2173 * go on. Make sure the pointer isn't garbage and that this inode 2174 * isn't already on the list. 2175 */ 2176 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2177 if (next_agino == agino || !xfs_verify_agino_or_null(mp, agno, next_agino)) { 2178 xfs_buf_mark_corrupt(agibp); 2179 return -EFSCORRUPTED; 2180 } 2181 2182 if (next_agino != NULLAGINO) { 2183 xfs_agino_t old_agino; 2184 2185 /* 2186 * There is already another inode in the bucket, so point this 2187 * inode to the current head of the list. 2188 */ 2189 error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, &old_agino); 2190 if (error) { 2191 return error; 2192 } 2193 ASSERT(old_agino == NULLAGINO); 2194 2195 /* 2196 * agino has been unlinked, add a backref from the next inode 2197 * back to agino. 2198 */ 2199 error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); 2200 if (error) { 2201 return error; 2202 } 2203 } 2204 2205 /* Point the head of the list to point to this inode. */ 2206 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); 2207} 2208 2209/* Return the imap, dinode pointer, and buffer for an inode. */ 2210STATIC int xfs_iunlink_map_ino(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agino_t agino, struct xfs_imap *imap, 2211 struct xfs_dinode **dipp, struct xfs_buf **bpp) 2212{ 2213 struct xfs_mount *mp = tp->t_mountp; 2214 int error; 2215 2216 imap->im_blkno = 0; 2217 error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); 2218 if (error) { 2219 xfs_warn(mp, "%s: xfs_imap returned error %d.", __func__, error); 2220 return error; 2221 } 2222 2223 error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); 2224 if (error) { 2225 xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); 2226 return error; 2227 } 2228 2229 return 0; 2230} 2231 2232/* 2233 * Walk the unlinked chain from @head_agino until we find the inode that 2234 * points to @target_agino. Return the inode number, map, dinode pointer, 2235 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. 2236 * 2237 * @tp, @pag, @head_agino, and @target_agino are input parameters. 2238 * @agino, @imap, @dipp, and @bpp are all output parameters. 
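 * The search consults the in-core backref cache first; only if there is no
 * cached predecessor, or the cached entry turns out to be stale, do we fall
 * back to walking the bucket list from @head_agino.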
2239 * 2240 * Do not call this function if @target_agino is the head of the list. 2241 */ 2242STATIC int xfs_iunlink_map_prev(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agino_t head_agino, 2243 xfs_agino_t target_agino, xfs_agino_t *agino, struct xfs_imap *imap, 2244 struct xfs_dinode **dipp, struct xfs_buf **bpp, struct xfs_perag *pag) 2245{ 2246 struct xfs_mount *mp = tp->t_mountp; 2247 xfs_agino_t next_agino; 2248 int error; 2249 2250 ASSERT(head_agino != target_agino); 2251 *bpp = NULL; 2252 2253 /* See if our backref cache can find it faster. */ 2254 *agino = xfs_iunlink_lookup_backref(pag, target_agino); 2255 if (*agino != NULLAGINO) { 2256 error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); 2257 if (error) { 2258 return error; 2259 } 2260 2261 if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) { 2262 return 0; 2263 } 2264 2265 /* 2266 * If we get here the cache contents were corrupt, so drop the 2267 * buffer and fall back to walking the bucket list. 2268 */ 2269 xfs_trans_brelse(tp, *bpp); 2270 *bpp = NULL; 2271 WARN_ON_ONCE(1); 2272 } 2273 2274 trace_xfs_iunlink_map_prev_fallback(mp, agno); 2275 2276 /* Otherwise, walk the entire bucket until we find it. */ 2277 next_agino = head_agino; 2278 while (next_agino != target_agino) { 2279 xfs_agino_t unlinked_agino; 2280 2281 if (*bpp) { 2282 xfs_trans_brelse(tp, *bpp); 2283 } 2284 2285 *agino = next_agino; 2286 error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, bpp); 2287 if (error) { 2288 return error; 2289 } 2290 2291 unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); 2292 /* 2293 * Make sure this pointer is valid and isn't an obvious 2294 * infinite loop. 2295 */ 2296 if (!xfs_verify_agino(mp, agno, unlinked_agino) || next_agino == unlinked_agino) { 2297 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, *dipp, sizeof(**dipp)); 2298 error = -EFSCORRUPTED; 2299 return error; 2300 } 2301 next_agino = unlinked_agino; 2302 } 2303 2304 return 0; 2305} 2306 2307/* 2308 * Pull the on-disk inode from the AGI unlinked list. 2309 */ 2310STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_inode *ip) 2311{ 2312 struct xfs_mount *mp = tp->t_mountp; 2313 struct xfs_agi *agi; 2314 struct xfs_buf *agibp; 2315 struct xfs_buf *last_ibp; 2316 struct xfs_dinode *last_dip = NULL; 2317 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 2318 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2319 xfs_agino_t next_agino; 2320 xfs_agino_t head_agino; 2321 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2322 int error; 2323 2324 trace_xfs_iunlink_remove(ip); 2325 2326 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2327 error = xfs_read_agi(mp, tp, agno, &agibp); 2328 if (error) { 2329 return error; 2330 } 2331 agi = agibp->b_addr; 2332 2333 /* 2334 * Get the index into the agi hash table for the list this inode will 2335 * go on. Make sure the head pointer isn't garbage. 2336 */ 2337 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2338 if (!xfs_verify_agino(mp, agno, head_agino)) { 2339 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); 2340 return -EFSCORRUPTED; 2341 } 2342 2343 /* 2344 * Set our inode's next_unlinked pointer to NULL and then return 2345 * the old pointer value so that we can update whatever was previous 2346 * to us in the list to point to whatever was next in the list. 
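 * In list terms, removing B from the chain A -> B -> C amounts to:
 *
 *	B.next_unlinked = NULLAGINO;	(xfs_iunlink_update_inode below)
 *	A.next_unlinked = C;		(xfs_iunlink_update_dinode below)
 *
 * or, when B is the head of the bucket, simply repointing the AGI bucket
 * at C via xfs_iunlink_update_bucket().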
2347 */ 2348 error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); 2349 if (error) { 2350 return error; 2351 } 2352 2353 /* 2354 * If there was a backref pointing from the next inode back to this 2355 * one, remove it because we've removed this inode from the list. 2356 * 2357 * Later, if this inode was in the middle of the list we'll update 2358 * this inode's backref to point from the next inode. 2359 */ 2360 if (next_agino != NULLAGINO) { 2361 error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); 2362 if (error) { 2363 return error; 2364 } 2365 } 2366 2367 if (head_agino != agino) { 2368 struct xfs_imap imap; 2369 xfs_agino_t prev_agino; 2370 2371 /* We need to search the list for the inode being freed. */ 2372 error = 2373 xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, agibp->b_pag); 2374 if (error) { 2375 return error; 2376 } 2377 2378 /* Point the previous inode on the list to the next inode. */ 2379 xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, last_dip, &imap, next_agino); 2380 2381 /* 2382 * Now we deal with the backref for this inode. If this inode 2383 * pointed at a real inode, change the backref that pointed to 2384 * us to point to our old next. If this inode was the end of 2385 * the list, delete the backref that pointed to us. Note that 2386 * change_backref takes care of deleting the backref if 2387 * next_agino is NULLAGINO. 2388 */ 2389 return xfs_iunlink_change_backref(agibp->b_pag, agino, next_agino); 2390 } 2391 2392 /* Point the head of the list to the next unlinked inode. */ 2393 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, next_agino); 2394} 2395 2396/* 2397 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2398 * mark it stale. We should only find clean inodes in this lookup that aren't 2399 * already stale. 2400 */ 2401static void xfs_ifree_mark_inode_stale(struct xfs_buf *bp, struct xfs_inode *free_ip, xfs_ino_t inum) 2402{ 2403 struct xfs_mount *mp = bp->b_mount; 2404 struct xfs_perag *pag = bp->b_pag; 2405 struct xfs_inode_log_item *iip; 2406 struct xfs_inode *ip; 2407 2408retry: 2409 rcu_read_lock(); 2410 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2411 /* Inode not in memory, nothing to do */ 2412 if (!ip) { 2413 rcu_read_unlock(); 2414 return; 2415 } 2416 2417 /* 2418 * because this is an RCU protected lookup, we could find a recently 2419 * freed or even reallocated inode during the lookup. We need to check 2420 * under the i_flags_lock for a valid inode here. Skip it if it is not 2421 * valid, the wrong inode or stale. 2422 */ 2423 spin_lock(&ip->i_flags_lock); 2424 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { 2425 goto out_iflags_unlock; 2426 } 2427 2428 /* 2429 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2430 * other inodes that we did not find in the list attached to the buffer 2431 * and are not already marked stale. If we can't lock it, back off and 2432 * retry. 2433 */ 2434 if (ip != free_ip) { 2435 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2436 spin_unlock(&ip->i_flags_lock); 2437 rcu_read_unlock(); 2438 delay(1); 2439 goto retry; 2440 } 2441 } 2442 ip->i_flags |= XFS_ISTALE; 2443 2444 /* 2445 * If the inode is flushing, it is already attached to the buffer. All 2446 * we needed to do here is mark the inode stale so buffer IO completion 2447 * will remove it from the AIL. 
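 * (That is what the XFS_IFLUSHING check below relies on: a flushing inode
 * is already on the buffer's li_bio_list with ili_last_fields set, so
 * nothing beyond the XFS_ISTALE flag we just set is required.)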
*/ 2449 iip = ip->i_itemp; 2450 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2451 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2452 ASSERT(iip->ili_last_fields); 2453 goto out_iunlock; 2454 } 2455 2456 /* 2457 * Inodes not attached to the buffer can be released immediately. 2458 * Everything else has to go through xfs_iflush_abort() on journal 2459 * commit as the flock synchronises removal of the inode from the 2460 * cluster buffer against inode reclaim. 2461 */ 2462 if (!iip || list_empty(&iip->ili_item.li_bio_list)) { 2463 goto out_iunlock; 2464 } 2465 2466 __xfs_iflags_set(ip, XFS_IFLUSHING); 2467 spin_unlock(&ip->i_flags_lock); 2468 rcu_read_unlock(); 2469 2470 /* we have a dirty inode in memory that has not yet been flushed. */ 2471 spin_lock(&iip->ili_lock); 2472 iip->ili_last_fields = iip->ili_fields; 2473 iip->ili_fields = 0; 2474 iip->ili_fsync_fields = 0; 2475 spin_unlock(&iip->ili_lock); 2476 ASSERT(iip->ili_last_fields); 2477 2478 if (ip != free_ip) { 2479 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2480 } 2481 return; 2482 2483out_iunlock: 2484 if (ip != free_ip) { 2485 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2486 } 2487out_iflags_unlock: 2488 spin_unlock(&ip->i_flags_lock); 2489 rcu_read_unlock(); 2490} 2491 2492/* 2493 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2494 * inodes that are in memory - they all must be marked stale and attached to 2495 * the cluster buffer. 2496 */ 2497STATIC int xfs_ifree_cluster(struct xfs_inode *free_ip, struct xfs_trans *tp, struct xfs_icluster *xic) 2498{ 2499 struct xfs_mount *mp = free_ip->i_mount; 2500 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2501 struct xfs_buf *bp; 2502 xfs_daddr_t blkno; 2503 xfs_ino_t inum = xic->first_ino; 2504 int nbufs; 2505 int i, j; 2506 int ioffset; 2507 int error; 2508 2509 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2510 2511 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2512 /* 2513 * The allocation bitmap tells us which inodes of the chunk were 2514 * physically allocated. Skip the cluster if an inode falls into 2515 * a sparse region. 2516 */ 2517 ioffset = inum - xic->first_ino; 2518 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2519 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2520 continue; 2521 } 2522 2523 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); 2524 2525 /* 2526 * We obtain and lock the backing buffer first in the process 2527 * here to ensure dirty inodes attached to the buffer remain in 2528 * the flushing state while we mark them stale. 2529 * 2530 * If we scan the in-memory inodes first, then buffer IO can 2531 * complete before we get a lock on it, and hence we may fail 2532 * to mark all the active inodes on the buffer stale. 2533 */ 2534 error = 2535 xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED, &bp); 2536 if (error) { 2537 return error; 2538 } 2539 2540 /* 2541 * This buffer may not have been correctly initialised as we 2542 * didn't read it from disk. That's not important because we are 2543 * only using it to mark the buffer as stale in the log, and to 2544 * attach stale cached inodes on it. That means it will never be 2545 * dispatched for IO. If it is, we want to know about it, and we 2546 * want it to fail. We can achieve this by adding a write 2547 * verifier to the buffer. 2548 */ 2549 bp->b_ops = &xfs_inode_buf_ops; 2550 2551 /* 2552 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2553 * too.
This requires lookups, and will skip inodes that we've 2554 * already marked XFS_ISTALE. 2555 */ 2556 for (i = 0; i < igeo->inodes_per_cluster; i++) { 2557 xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); 2558 } 2559 2560 xfs_trans_stale_inode_buf(tp, bp); 2561 xfs_trans_binval(tp, bp); 2562 } 2563 return 0; 2564} 2565 2566/* 2567 * This is called to return an inode to the inode free list. 2568 * The inode should already be truncated to 0 length and have 2569 * no pages associated with it. This routine also assumes that 2570 * the inode is already a part of the transaction. 2571 * 2572 * The on-disk copy of the inode will have been added to the list 2573 * of unlinked inodes in the AGI. We need to remove the inode from 2574 * that list atomically with respect to freeing it here. 2575 */ 2576int xfs_ifree(struct xfs_trans *tp, struct xfs_inode *ip) 2577{ 2578 int error; 2579 struct xfs_icluster xic = {0}; 2580 struct xfs_inode_log_item *iip = ip->i_itemp; 2581 2582 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2583 ASSERT(VFS_I(ip)->i_nlink == 0); 2584 ASSERT(ip->i_df.if_nextents == 0); 2585 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2586 ASSERT(ip->i_d.di_nblocks == 0); 2587 2588 /* 2589 * Pull the on-disk inode from the AGI unlinked list. 2590 */ 2591 error = xfs_iunlink_remove(tp, ip); 2592 if (error) { 2593 return error; 2594 } 2595 2596 error = xfs_difree(tp, ip->i_ino, &xic); 2597 if (error) { 2598 return error; 2599 } 2600 2601 /* 2602 * Free any local-format data sitting around before we reset the 2603 * data fork to extents format. Note that the attr fork data has 2604 * already been freed by xfs_attr_inactive. 2605 */ 2606 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2607 kmem_free(ip->i_df.if_u1.if_data); 2608 ip->i_df.if_u1.if_data = NULL; 2609 ip->i_df.if_bytes = 0; 2610 } 2611 2612 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2613 ip->i_d.di_flags = 0; 2614 ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; 2615 ip->i_d.di_dmevmask = 0; 2616 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2617 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2618 2619 /* Don't attempt to replay owner changes for a deleted inode */ 2620 spin_lock(&iip->ili_lock); 2621 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2622 spin_unlock(&iip->ili_lock); 2623 2624 /* 2625 * Bump the generation count so no one will be confused 2626 * by reincarnations of this inode. 2627 */ 2628 VFS_I(ip)->i_generation++; 2629 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2630 2631 if (xic.deleted) { 2632 error = xfs_ifree_cluster(ip, tp, &xic); 2633 } 2634 2635 return error; 2636} 2637 2638/* 2639 * This is called to unpin an inode. The caller must have the inode locked 2640 * in at least shared mode so that the buffer cannot be subsequently pinned 2641 * once someone is waiting for it to be unpinned. 
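 * Note that xfs_iunpin() only kicks off the log force that will eventually
 * unpin the inode; it does not wait for the pin count to drop. Callers that
 * need to wait use xfs_iunpin_wait() below.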
2642 */ 2643static void xfs_iunpin(struct xfs_inode *ip) 2644{ 2645 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); 2646 2647 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2648 2649 /* Give the log a push to start the unpinning I/O */ 2650 xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); 2651} 2652 2653static void _xfs_iunpin_wait(struct xfs_inode *ip) 2654{ 2655 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2656 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2657 2658 xfs_iunpin(ip); 2659 2660 do { 2661 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2662 if (xfs_ipincount(ip)) { 2663 io_schedule(); 2664 } 2665 } while (xfs_ipincount(ip)); 2666 finish_wait(wq, &wait.wq_entry); 2667} 2668 2669void xfs_iunpin_wait(struct xfs_inode *ip) 2670{ 2671 if (xfs_ipincount(ip)) { 2672 _xfs_iunpin_wait(ip); 2673 } 2674} 2675 2676/* 2677 * Removing an inode from the namespace involves removing the directory entry 2678 * and dropping the link count on the inode. Removing the directory entry can 2679 * result in locking an AGF (directory blocks were freed) and removing a link 2680 * count can result in placing the inode on an unlinked list which results in 2681 * locking an AGI. 2682 * 2683 * The big problem here is that we have an ordering constraint on AGF and AGI 2684 * locking - inode allocation locks the AGI, then can allocate a new extent for 2685 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2686 * removes the inode from the unlinked list, requiring that we lock the AGI 2687 * first, and then freeing the inode can result in an inode chunk being freed 2688 * and hence freeing disk space requiring that we lock an AGF. 2689 * 2690 * Hence the ordering that is imposed by other parts of the code is AGI before 2691 * AGF. This means we cannot remove the directory entry before we drop the inode 2692 * reference count and put it on the unlinked list as this results in a lock 2693 * order of AGF then AGI, and this can deadlock against inode allocation and 2694 * freeing. Therefore we must drop the link counts before we remove the 2695 * directory entry. 2696 * 2697 * This is still safe from a transactional point of view - it is not until we 2698 * get to xfs_defer_finish() that we have the possibility of multiple 2699 * transactions in this operation. Hence as long as we remove the directory 2700 * entry and drop the link count in the first transaction of the remove 2701 * operation, there are no transactional constraints on the ordering here. 2702 */ 2703int xfs_remove(xfs_inode_t *dp, struct xfs_name *name, xfs_inode_t *ip) 2704{ 2705 xfs_mount_t *mp = dp->i_mount; 2706 xfs_trans_t *tp = NULL; 2707 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2708 int error = 0; 2709 uint resblks; 2710 2711 trace_xfs_remove(dp, name); 2712 2713 if (XFS_FORCED_SHUTDOWN(mp)) { 2714 return -EIO; 2715 } 2716 2717 error = xfs_qm_dqattach(dp); 2718 if (error) { 2719 goto std_return; 2720 } 2721 2722 error = xfs_qm_dqattach(ip); 2723 if (error) { 2724 goto std_return; 2725 } 2726 2727 /* 2728 * We try to get the real space reservation first, 2729 * allowing for directory btree deletion(s) implying 2730 * possible bmap insert(s). If we can't get the space 2731 * reservation then we use 0 instead, and avoid the bmap 2732 * btree insert(s) in the directory code by, if the bmap 2733 * insert tries to happen, instead trimming the LAST 2734 * block from the directory. 
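 * The -ENOSPC fallback below implements exactly that: retry the transaction
 * allocation with resblks set to zero.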
2735 */ 2736 resblks = XFS_REMOVE_SPACE_RES(mp); 2737 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); 2738 if (error == -ENOSPC) { 2739 resblks = 0; 2740 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, &tp); 2741 } 2742 if (error) { 2743 ASSERT(error != -ENOSPC); 2744 goto std_return; 2745 } 2746 2747 xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); 2748 2749 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2750 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 2751 2752 /* 2753 * If we're removing a directory perform some additional validation. 2754 */ 2755 if (is_dir) { 2756 ASSERT(VFS_I(ip)->i_nlink >= 0x2); 2757 if (VFS_I(ip)->i_nlink != 0x2) { 2758 error = -ENOTEMPTY; 2759 goto out_trans_cancel; 2760 } 2761 if (!xfs_dir_isempty(ip)) { 2762 error = -ENOTEMPTY; 2763 goto out_trans_cancel; 2764 } 2765 2766 /* Drop the link from ip's "..". */ 2767 error = xfs_droplink(tp, dp); 2768 if (error) { 2769 goto out_trans_cancel; 2770 } 2771 2772 /* Drop the "." link from ip to self. */ 2773 error = xfs_droplink(tp, ip); 2774 if (error) { 2775 goto out_trans_cancel; 2776 } 2777 } else { 2778 /* 2779 * When removing a non-directory we need to log the parent 2780 * inode here. For a directory this is done implicitly 2781 * by the xfs_droplink call for the ".." entry. 2782 */ 2783 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2784 } 2785 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2786 2787 /* Drop the link from dp to ip. */ 2788 error = xfs_droplink(tp, ip); 2789 if (error) { 2790 goto out_trans_cancel; 2791 } 2792 2793 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2794 if (error) { 2795 ASSERT(error != -ENOENT); 2796 goto out_trans_cancel; 2797 } 2798 2799 /* 2800 * If this is a synchronous mount, make sure that the 2801 * remove transaction goes to disk before returning to 2802 * the user. 2803 */ 2804 if (mp->m_flags & (XFS_MOUNT_WSYNC | XFS_MOUNT_DIRSYNC)) { 2805 xfs_trans_set_sync(tp); 2806 } 2807 2808 error = xfs_trans_commit(tp); 2809 if (error) { 2810 goto std_return; 2811 } 2812 2813 if (is_dir && xfs_inode_is_filestream(ip)) { 2814 xfs_filestream_deassociate(ip); 2815 } 2816 2817 return 0; 2818 2819out_trans_cancel: 2820 xfs_trans_cancel(tp); 2821std_return: 2822 return error; 2823} 2824 2825/* 2826 * Enter all inodes for a rename transaction into a sorted array. 2827 */ 2828#define _XFS_SORT_INODES 5 2829STATIC void xfs_sort_for_rename(struct xfs_inode *dp1, /* in: old (source) directory inode */ 2830 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2831 struct xfs_inode *ip1, /* in: inode of old entry */ 2832 struct xfs_inode *ip2, /* in: inode of new entry */ 2833 struct xfs_inode *wip, /* in: whiteout inode */ 2834 struct xfs_inode **i_tab, /* out: sorted array of inodes */ 2835 int *num_inodes) /* in/out: inodes in array */ 2836{ 2837 int i, j; 2838 2839 ASSERT(*num_inodes == _XFS_SORT_INODES); 2840 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2841 2842 /* 2843 * i_tab contains a list of pointers to inodes. We initialize 2844 * the table here & we'll sort it. We will then use it to 2845 * order the acquisition of the inode locks. 2846 * 2847 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2848 */ 2849 i = 0; 2850 i_tab[i++] = dp1; 2851 i_tab[i++] = dp2; 2852 i_tab[i++] = ip1; 2853 if (ip2) { 2854 i_tab[i++] = ip2; 2855 } 2856 if (wip) { 2857 i_tab[i++] = wip; 2858 } 2859 *num_inodes = i; 2860 2861 /* 2862 * Sort the elements via bubble sort. 
(Remember, there are at 2863 * most 5 elements to sort, so this is adequate.) 2864 */ 2865 for (i = 0; i < *num_inodes; i++) { 2866 for (j = 1; j < *num_inodes; j++) { 2867 if (i_tab[j]->i_ino < i_tab[j - 1]->i_ino) { 2868 struct xfs_inode *temp = i_tab[j]; 2869 i_tab[j] = i_tab[j - 1]; 2870 i_tab[j - 1] = temp; 2871 } 2872 } 2873 } 2874} 2875 2876static int xfs_finish_rename(struct xfs_trans *tp) 2877{ 2878 /* 2879 * If this is a synchronous mount, make sure that the rename transaction 2880 * goes to disk before returning to the user. 2881 */ 2882 if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC | XFS_MOUNT_DIRSYNC)) { 2883 xfs_trans_set_sync(tp); 2884 } 2885 2886 return xfs_trans_commit(tp); 2887} 2888 2889/* 2890 * xfs_cross_rename() 2891 * 2892 * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall 2893 */ 2894STATIC int xfs_cross_rename(struct xfs_trans *tp, struct xfs_inode *dp1, struct xfs_name *name1, struct xfs_inode *ip1, 2895 struct xfs_inode *dp2, struct xfs_name *name2, struct xfs_inode *ip2, int spaceres) 2896{ 2897 int error = 0; 2898 int ip1_flags = 0; 2899 int ip2_flags = 0; 2900 int dp2_flags = 0; 2901 2902 /* Swap inode number for dirent in first parent */ 2903 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2904 if (error) { 2905 goto out_trans_abort; 2906 } 2907 2908 /* Swap inode number for dirent in second parent */ 2909 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2910 if (error) { 2911 goto out_trans_abort; 2912 } 2913 2914 /* 2915 * If we're renaming one or more directories across different parents, 2916 * update the respective ".." entries (and link counts) to match the new 2917 * parents. 2918 */ 2919 if (dp1 != dp2) { 2920 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2921 2922 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2923 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, spaceres); 2924 if (error) { 2925 goto out_trans_abort; 2926 } 2927 2928 /* transfer ip2 ".." reference to dp1 */ 2929 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2930 error = xfs_droplink(tp, dp2); 2931 if (error) { 2932 goto out_trans_abort; 2933 } 2934 xfs_bumplink(tp, dp1); 2935 } 2936 2937 /* 2938 * Although ip1 isn't changed here, userspace needs 2939 * to be warned about the change, so that applications 2940 * relying on it (like backup ones), will properly 2941 * notify the change 2942 */ 2943 ip1_flags |= XFS_ICHGTIME_CHG; 2944 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2945 } 2946 2947 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2948 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, spaceres); 2949 if (error) { 2950 goto out_trans_abort; 2951 } 2952 2953 /* transfer ip1 ".." 
reference to dp2 */ 2954 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2955 error = xfs_droplink(tp, dp1); 2956 if (error) { 2957 goto out_trans_abort; 2958 } 2959 xfs_bumplink(tp, dp2); 2960 } 2961 2962 /* 2963 * Although ip2 isn't changed here, userspace needs 2964 * to be warned about the change, so that applications 2965 * relying on it (like backup ones), will properly 2966 * notify the change 2967 */ 2968 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2969 ip2_flags |= XFS_ICHGTIME_CHG; 2970 } 2971 } 2972 2973 if (ip1_flags) { 2974 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2975 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2976 } 2977 if (ip2_flags) { 2978 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2979 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2980 } 2981 if (dp2_flags) { 2982 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2983 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2984 } 2985 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2986 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2987 return xfs_finish_rename(tp); 2988 2989out_trans_abort: 2990 xfs_trans_cancel(tp); 2991 return error; 2992} 2993 2994/* 2995 * xfs_rename_alloc_whiteout() 2996 * 2997 * Return a referenced, unlinked, unlocked inode that can be used as a 2998 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2999 * crash between allocating the inode and linking it into the rename transaction 3000 * recovery will free the inode and we won't leak it. 3001 */ 3002static int xfs_rename_alloc_whiteout(struct xfs_inode *dp, struct xfs_inode **wip) 3003{ 3004 struct xfs_inode *tmpfile; 3005 int error; 3006 3007 error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); 3008 if (error) { 3009 return error; 3010 } 3011 3012 /* 3013 * Prepare the tmpfile inode as if it were created through the VFS. 3014 * Complete the inode setup and flag it as linkable. nlink is already 3015 * zero, so we can skip the drop_nlink. 3016 */ 3017 xfs_setup_iops(tmpfile); 3018 xfs_finish_inode_setup(tmpfile); 3019 VFS_I(tmpfile)->i_state |= I_LINKABLE; 3020 3021 *wip = tmpfile; 3022 return 0; 3023} 3024 3025/* 3026 * xfs_rename 3027 */ 3028int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, 3029 struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip, 3030 unsigned int flags) 3031{ 3032 struct xfs_mount *mp = src_dp->i_mount; 3033 struct xfs_trans *tp; 3034 struct xfs_inode *wip = NULL; /* whiteout inode */ 3035 struct xfs_inode *inodes[_XFS_SORT_INODES]; 3036 int i; 3037 int num_inodes = _XFS_SORT_INODES; 3038 bool new_parent = (src_dp != target_dp); 3039 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 3040 int spaceres; 3041 int error; 3042 3043 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 3044 3045 if ((flags & RENAME_EXCHANGE) && !target_ip) { 3046 return -EINVAL; 3047 } 3048 3049 /* 3050 * If we are doing a whiteout operation, allocate the whiteout inode 3051 * we will be placing at the target and ensure the type is set 3052 * appropriately. 
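 * (The whiteout is created as a character-device tmpfile, which is why the
 * source dirent type is forced to XFS_DIR3_FT_CHRDEV below.)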
3053 */ 3054 if (flags & RENAME_WHITEOUT) { 3055 error = xfs_rename_alloc_whiteout(target_dp, &wip); 3056 if (error) { 3057 return error; 3058 } 3059 3060 /* setup target dirent info as whiteout */ 3061 src_name->type = XFS_DIR3_FT_CHRDEV; 3062 } 3063 3064 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); 3065 3066 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 3067 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 3068 if (error == -ENOSPC) { 3069 spaceres = 0; 3070 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, &tp); 3071 } 3072 if (error) { 3073 goto out_release_wip; 3074 } 3075 3076 /* 3077 * Attach the dquots to the inodes 3078 */ 3079 error = xfs_qm_vop_rename_dqattach(inodes); 3080 if (error) { 3081 goto out_trans_cancel; 3082 } 3083 3084 /* 3085 * Lock all the participating inodes. Depending upon whether 3086 * the target_name exists in the target directory, and 3087 * whether the target directory is the same as the source 3088 * directory, we can lock from 2 to 4 inodes. 3089 */ 3090 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 3091 3092 /* 3093 * Join all the inodes to the transaction. From this point on, 3094 * we can rely on either trans_commit or trans_cancel to unlock 3095 * them. 3096 */ 3097 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3098 if (new_parent) { 3099 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3100 } 3101 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3102 if (target_ip) { 3103 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3104 } 3105 if (wip) { 3106 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3107 } 3108 3109 /* 3110 * If we are using project inheritance, we only allow renames 3111 * into our tree when the project IDs are the same; else the 3112 * tree quota mechanism would be circumvented. 3113 */ 3114 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 3115 target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { 3116 error = -EXDEV; 3117 goto out_trans_cancel; 3118 } 3119 3120 /* RENAME_EXCHANGE is unique from here on. */ 3121 if (flags & RENAME_EXCHANGE) { 3122 return xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, spaceres); 3123 } 3124 3125 /* 3126 * Check for expected errors before we dirty the transaction 3127 * so we can return an error without a transaction abort. 3128 */ 3129 if (target_ip == NULL) { 3130 /* 3131 * If there's no space reservation, check the entry will 3132 * fit before actually inserting it. 3133 */ 3134 if (!spaceres) { 3135 error = xfs_dir_canenter(tp, target_dp, target_name); 3136 if (error) { 3137 goto out_trans_cancel; 3138 } 3139 } 3140 } else { 3141 /* 3142 * If target exists and it's a directory, check that whether 3143 * it can be destroyed. 3144 */ 3145 if (S_ISDIR(VFS_I(target_ip)->i_mode) && (!xfs_dir_isempty(target_ip) || (VFS_I(target_ip)->i_nlink > 0x2))) { 3146 error = -EEXIST; 3147 goto out_trans_cancel; 3148 } 3149 } 3150 3151 /* 3152 * Lock the AGI buffers we need to handle bumping the nlink of the 3153 * whiteout inode off the unlinked list and to handle dropping the 3154 * nlink of the target inode. Per locking order rules, do this in 3155 * increasing AG order and before directory block allocation tries to 3156 * grab AGFs because we grab AGIs before AGFs. 3157 * 3158 * The (vfs) caller must ensure that if src is a directory then 3159 * target_ip is either null or an empty directory. 
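 * Walking the sorted inodes[] array below gives us that increasing AG order
 * for free: xfs_sort_for_rename() ordered the inodes by inode number, and
 * the AG number lives in the high bits of the inode number.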
3160 */ 3161 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3162 if (inodes[i] == wip || 3163 (inodes[i] == target_ip && 3164 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3165 struct xfs_buf *bp; 3166 xfs_agnumber_t agno; 3167 3168 agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); 3169 error = xfs_read_agi(mp, tp, agno, &bp); 3170 if (error) 3171 goto out_trans_cancel; 3172 } 3173 } 3174 3175 /* 3176 * Directory entry creation below may acquire the AGF. Remove 3177 * the whiteout from the unlinked list first to preserve correct 3178 * AGI/AGF locking order. This dirties the transaction so failures 3179 * after this point will abort and log recovery will clean up the 3180 * mess. 3181 * 3182 * For whiteouts, we need to bump the link count on the whiteout 3183 * inode. After this point, we have a real link, clear the tmpfile 3184 * state flag from the inode so it doesn't accidentally get misused 3185 * in future. 3186 */ 3187 if (wip) { 3188 ASSERT(VFS_I(wip)->i_nlink == 0); 3189 error = xfs_iunlink_remove(tp, wip); 3190 if (error) { 3191 goto out_trans_cancel; 3192 } 3193 3194 xfs_bumplink(tp, wip); 3195 VFS_I(wip)->i_state &= ~I_LINKABLE; 3196 } 3197 3198 /* 3199 * Set up the target. 3200 */ 3201 if (target_ip == NULL) { 3202 /* 3203 * If target does not exist and the rename crosses 3204 * directories, adjust the target directory link count 3205 * to account for the ".." reference from the new entry. 3206 */ 3207 error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, spaceres); 3208 if (error) { 3209 goto out_trans_cancel; 3210 } 3211 3212 xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3213 3214 if (new_parent && src_is_directory) { 3215 xfs_bumplink(tp, target_dp); 3216 } 3217 } else { /* target_ip != NULL */ 3218 /* 3219 * Link the source inode under the target name. 3220 * If the source inode is a directory and we are moving 3221 * it across directories, its ".." entry will be 3222 * inconsistent until we replace that down below. 3223 * 3224 * In case there is already an entry with the same 3225 * name at the destination directory, remove it first. 3226 */ 3227 3228 error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); 3229 if (error) { 3230 goto out_trans_cancel; 3231 } 3232 3233 xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3234 3235 /* 3236 * Decrement the link count on the target since the target 3237 * dir no longer points to it. 3238 */ 3239 error = xfs_droplink(tp, target_ip); 3240 if (error) { 3241 goto out_trans_cancel; 3242 } 3243 3244 if (src_is_directory) { 3245 /* 3246 * Drop the link from the old "." entry. 3247 */ 3248 error = xfs_droplink(tp, target_ip); 3249 if (error) { 3250 goto out_trans_cancel; 3251 } 3252 } 3253 } /* target_ip != NULL */ 3254 3255 /* 3256 * Remove the source. 3257 */ 3258 if (new_parent && src_is_directory) { 3259 /* 3260 * Rewrite the ".." entry to point to the new 3261 * directory. 3262 */ 3263 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, spaceres); 3264 ASSERT(error != -EEXIST); 3265 if (error) { 3266 goto out_trans_cancel; 3267 } 3268 } 3269 3270 /* 3271 * We always want to hit the ctime on the source inode. 3272 * 3273 * This isn't strictly required by the standards since the source 3274 * inode isn't really being changed, but old unix file systems did 3275 * it and some incremental backup programs won't work without it. 
3276 */ 3277 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3278 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3279 3280 /* 3281 * Adjust the link count on src_dp. This is necessary when 3282 * renaming a directory, either within one parent when 3283 * the target existed, or across two parent directories. 3284 */ 3285 if (src_is_directory && (new_parent || target_ip != NULL)) { 3286 /* 3287 * Decrement link count on src_directory since the 3288 * entry that's moved no longer points to it. 3289 */ 3290 error = xfs_droplink(tp, src_dp); 3291 if (error) { 3292 goto out_trans_cancel; 3293 } 3294 } 3295 3296 /* 3297 * For whiteouts, we only need to update the source dirent with the 3298 * inode number of the whiteout inode rather than removing it 3299 * altogether. 3300 */ 3301 if (wip) { 3302 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); 3303 } else { 3304 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); 3305 } 3306 if (error) { 3307 goto out_trans_cancel; 3308 } 3309 3310 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3311 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3312 if (new_parent) { 3313 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3314 } 3315 3316 error = xfs_finish_rename(tp); 3317 if (wip) { 3318 xfs_irele(wip); 3319 } 3320 return error; 3321 3322out_trans_cancel: 3323 xfs_trans_cancel(tp); 3324out_release_wip: 3325 if (wip) { 3326 xfs_irele(wip); 3327 } 3328 return error; 3329} 3330 3331static int xfs_iflush(struct xfs_inode *ip, struct xfs_buf *bp) 3332{ 3333 struct xfs_inode_log_item *iip = ip->i_itemp; 3334 struct xfs_dinode *dip; 3335 struct xfs_mount *mp = ip->i_mount; 3336 int error; 3337 3338 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); 3339 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3340 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3341 ASSERT(iip->ili_item.li_buf == bp); 3342 3343 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3344 3345 /* 3346 * We don't flush the inode if any of the following checks fail, but we 3347 * do still update the log item and attach to the backing buffer as if 3348 * the flush happened. This is a formality to facilitate predictable 3349 * error handling as the caller will shutdown and fail the buffer. 
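 * That is why the corruption checks below jump to flush_out with error left
 * at -EFSCORRUPTED rather than returning early: the log item state is still
 * updated and the inode CRC still recalculated on the way out.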
3350 */ 3351 error = -EFSCORRUPTED; 3352 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { 3353 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr " PTR_FMT, __func__, ip->i_ino, 3354 be16_to_cpu(dip->di_magic), dip); 3355 goto flush_out; 3356 } 3357 if (S_ISREG(VFS_I(ip)->i_mode)) { 3358 if (XFS_TEST_ERROR(ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3359 mp, XFS_ERRTAG_IFLUSH_3)) { 3360 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr " PTR_FMT, __func__, ip->i_ino, ip); 3361 goto flush_out; 3362 } 3363 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3364 if (XFS_TEST_ERROR(ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3365 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3366 mp, XFS_ERRTAG_IFLUSH_4)) { 3367 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr " PTR_FMT, __func__, ip->i_ino, ip); 3368 goto flush_out; 3369 } 3370 } 3371 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, 3372 XFS_ERRTAG_IFLUSH_5)) { 3373 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3374 "%s: detected corrupt incore inode %Lu, " 3375 "total extents = %d, nblocks = %Ld, ptr " PTR_FMT, 3376 __func__, ip->i_ino, ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, 3377 ip); 3378 goto flush_out; 3379 } 3380 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { 3381 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr " PTR_FMT, __func__, ip->i_ino, 3382 ip->i_d.di_forkoff, ip); 3383 goto flush_out; 3384 } 3385 3386 /* 3387 * Inode item log recovery for v2 inodes are dependent on the 3388 * di_flushiter count for correct sequencing. We bump the flush 3389 * iteration count so we can detect flushes which postdate a log record 3390 * during recovery. This is redundant as we now log every change and 3391 * hence this can't happen but we need to still do it to ensure 3392 * backwards compatibility with old kernels that predate logging all 3393 * inode changes. 3394 */ 3395 if (!xfs_sb_version_has_v3inode(&mp->m_sb)) { 3396 ip->i_d.di_flushiter++; 3397 } 3398 3399 /* 3400 * If there are inline format data / attr forks attached to this inode, 3401 * make sure they are not corrupt. 3402 */ 3403 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) { 3404 goto flush_out; 3405 } 3406 if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) { 3407 goto flush_out; 3408 } 3409 3410 /* 3411 * Copy the dirty parts of the inode into the on-disk inode. We always 3412 * copy out the core of the inode, because if the inode is dirty at all 3413 * the core must be. 3414 */ 3415 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3416 3417 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3418 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) { 3419 ip->i_d.di_flushiter = 0; 3420 } 3421 3422 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3423 if (XFS_IFORK_Q(ip)) { 3424 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3425 } 3426 3427 /* 3428 * We've recorded everything logged in the inode, so we'd like to clear 3429 * the ili_fields bits so we don't log and flush things unnecessarily. 3430 * However, we can't stop logging all this information until the data 3431 * we've copied into the disk buffer is written to disk. 
If we did we 3432 * might overwrite the copy of the inode in the log with all the data 3433 * after re-logging only part of it, and in the face of a crash we 3434 * wouldn't have all the data we need to recover. 3435 * 3436 * What we do is move the bits to the ili_last_fields field. When 3437 * logging the inode, these bits are moved back to the ili_fields field. 3438 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3439 * we know that the information those bits represent is permanently on 3440 * disk. As long as the flush completes before the inode is logged 3441 * again, then both ili_fields and ili_last_fields will be cleared. 3442 */ 3443 error = 0; 3444flush_out: 3445 spin_lock(&iip->ili_lock); 3446 iip->ili_last_fields = iip->ili_fields; 3447 iip->ili_fields = 0; 3448 iip->ili_fsync_fields = 0; 3449 spin_unlock(&iip->ili_lock); 3450 3451 /* 3452 * Store the current LSN of the inode so that we can tell whether the 3453 * item has moved in the AIL from xfs_buf_inode_iodone(). 3454 */ 3455 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); 3456 3457 /* generate the checksum. */ 3458 xfs_dinode_calc_crc(mp, dip); 3459 return error; 3460} 3461 3462/* 3463 * Non-blocking flush of dirty inode metadata into the backing buffer. 3464 * 3465 * The caller must have a reference to the inode and hold the cluster buffer 3466 * locked. The function will walk across all the inodes on the cluster buffer it 3467 * can find and lock without blocking, and flush them to the cluster buffer. 3468 * 3469 * On successful flushing of at least one inode, the caller must write out the 3470 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3471 * the caller needs to release the buffer. On failure, the filesystem will be 3472 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3473 * will be returned. 3474 */ 3475int xfs_iflush_cluster(struct xfs_buf *bp) 3476{ 3477 struct xfs_mount *mp = bp->b_mount; 3478 struct xfs_log_item *lip, *n; 3479 struct xfs_inode *ip; 3480 struct xfs_inode_log_item *iip; 3481 int clcount = 0; 3482 int error = 0; 3483 3484 /* 3485 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3486 * can remove itself from the list. 3487 */ 3488 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) 3489 { 3490 iip = (struct xfs_inode_log_item *)lip; 3491 ip = iip->ili_inode; 3492 3493 /* 3494 * Quick and dirty check to avoid locks if possible. 3495 */ 3496 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3497 continue; 3498 } 3499 if (xfs_ipincount(ip)) { 3500 continue; 3501 } 3502 3503 /* 3504 * The inode is still attached to the buffer, which means it is 3505 * dirty but reclaim might try to grab it. Check carefully for 3506 * that, and grab the ilock while still holding the i_flags_lock 3507 * to guarantee reclaim will not be able to reclaim this inode 3508 * once we drop the i_flags_lock. 3509 */ 3510 spin_lock(&ip->i_flags_lock); 3511 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3512 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3513 spin_unlock(&ip->i_flags_lock); 3514 continue; 3515 } 3516 3517 /* 3518 * ILOCK will pin the inode against reclaim and prevent 3519 * concurrent transactions modifying the inode while we are 3520 * flushing the inode. If we get the lock, set the flushing 3521 * state before we drop the i_flags_lock. 
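 * We only ever take the ILOCK shared and non-blocking here; an inode we
 * cannot lock immediately is simply skipped and left for a later flush
 * attempt.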
3522 */ 3523 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3524 spin_unlock(&ip->i_flags_lock); 3525 continue; 3526 } 3527 __xfs_iflags_set(ip, XFS_IFLUSHING); 3528 spin_unlock(&ip->i_flags_lock); 3529 3530 /* 3531 * Abort flushing this inode if we are shut down because the 3532 * inode may not currently be in the AIL. This can occur when 3533 * log I/O failure unpins the inode without inserting into the 3534 * AIL, leaving a dirty/unpinned inode attached to the buffer 3535 * that otherwise looks like it should be flushed. 3536 */ 3537 if (XFS_FORCED_SHUTDOWN(mp)) { 3538 xfs_iunpin_wait(ip); 3539 xfs_iflush_abort(ip); 3540 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3541 error = -EIO; 3542 continue; 3543 } 3544 3545 /* don't block waiting on a log force to unpin dirty inodes */ 3546 if (xfs_ipincount(ip)) { 3547 xfs_iflags_clear(ip, XFS_IFLUSHING); 3548 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3549 continue; 3550 } 3551 3552 if (!xfs_inode_clean(ip)) { 3553 error = xfs_iflush(ip, bp); 3554 } else { 3555 xfs_iflags_clear(ip, XFS_IFLUSHING); 3556 } 3557 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3558 if (error) { 3559 break; 3560 } 3561 clcount++; 3562 } 3563 3564 if (error) { 3565 bp->b_flags |= XBF_ASYNC; 3566 xfs_buf_ioend_fail(bp); 3567 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3568 return error; 3569 } 3570 3571 if (!clcount) { 3572 return -EAGAIN; 3573 } 3574 3575 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3576 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3577 return 0; 3578} 3579 3580/* Release an inode. */ 3581void xfs_irele(struct xfs_inode *ip) 3582{ 3583 trace_xfs_irele(ip, _RET_IP_); 3584 iput(VFS_I(ip)); 3585} 3586 3587/* 3588 * Ensure all commited transactions touching the inode are written to the log. 3589 */ 3590int xfs_log_force_inode(struct xfs_inode *ip) 3591{ 3592 xfs_csn_t seq = 0; 3593 3594 xfs_ilock(ip, XFS_ILOCK_SHARED); 3595 if (xfs_ipincount(ip)) { 3596 seq = ip->i_itemp->ili_commit_seq; 3597 } 3598 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3599 3600 if (!seq) { 3601 return 0; 3602 } 3603 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); 3604} 3605 3606/* 3607 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3608 * abide vfs locking order (lowest pointer value goes first) and breaking the 3609 * layout leases before proceeding. The loop is needed because we cannot call 3610 * the blocking break_layout() with the iolocks held, and therefore have to 3611 * back out both locks. 3612 */ 3613static int xfs_iolock_two_inodes_and_break_layout(struct inode *src, struct inode *dest) 3614{ 3615 int error; 3616 3617 if (src > dest) { 3618 swap(src, dest); 3619 } 3620 3621 while (1) { 3622 /* Wait to break both inodes' layouts before we start locking. */ 3623 error = break_layout(src, true); 3624 if (error) { 3625 return error; 3626 } 3627 if (src != dest) { 3628 error = break_layout(dest, true); 3629 if (error) { 3630 return error; 3631 } 3632 } 3633 3634 /* Lock one inode and make sure nobody got in and leased it. */ 3635 inode_lock(src); 3636 error = break_layout(src, false); 3637 if (error) { 3638 inode_unlock(src); 3639 if (error == -EWOULDBLOCK) { 3640 continue; 3641 } 3642 return error; 3643 } 3644 3645 if (src == dest) { 3646 return 0; 3647 } 3648 3649 /* Lock the other inode and make sure nobody got in and leased it. 
*/ 3650 inode_lock_nested(dest, I_MUTEX_NONDIR2); 3651 error = break_layout(dest, false); 3652 if (error) { 3653 inode_unlock(src); 3654 inode_unlock(dest); 3655 if (error == -EWOULDBLOCK) { 3656 continue; 3657 } 3658 return error; 3659 } 3660 break; 3661 } 3662 3663 return 0; 3664} 3665 3666/* 3667 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or 3668 * mmap activity. 3669 */ 3670int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2) 3671{ 3672 int ret; 3673 3674 ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); 3675 if (ret) { 3676 return ret; 3677 } 3678 if (ip1 == ip2) { 3679 xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); 3680 } else { 3681 xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, ip2, XFS_MMAPLOCK_EXCL); 3682 } 3683 return 0; 3684} 3685 3686/* Unlock both inodes to allow IO and mmap activity. */ 3687void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2) 3688{ 3689 bool same_inode = (ip1 == ip2); 3690 3691 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3692 if (!same_inode) { 3693 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3694 } 3695 inode_unlock(VFS_I(ip2)); 3696 if (!same_inode) { 3697 inode_unlock(VFS_I(ip1)); 3698 } 3699} 3700
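
/*
 * Illustrative sketch only (not a definitive caller): a data copy operation
 * that needs to fence off both syscall I/O and page faults on two inodes,
 * here assumed to be ip1 and ip2, would typically bracket its work with the
 * helpers above:
 *
 *	error = xfs_ilock2_io_mmap(ip1, ip2);
 *	if (error)
 *		return error;
 *	... flush and manipulate both files' data ...
 *	xfs_iunlock2_io_mmap(ip1, ip2);
 *
 * xfs_ilock2_io_mmap() leaves each inode's i_rwsem and MMAPLOCK held
 * exclusively with layout leases broken, which is exactly what
 * xfs_iunlock2_io_mmap() drops again.
 */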