1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6#include <linux/iversion.h>
7
8#include "xfs.h"
9#include "xfs_fs.h"
10#include "xfs_shared.h"
11#include "xfs_format.h"
12#include "xfs_log_format.h"
13#include "xfs_trans_resv.h"
14#include "xfs_sb.h"
15#include "xfs_mount.h"
16#include "xfs_defer.h"
17#include "xfs_inode.h"
18#include "xfs_dir2.h"
19#include "xfs_attr.h"
20#include "xfs_trans_space.h"
21#include "xfs_trans.h"
22#include "xfs_buf_item.h"
23#include "xfs_inode_item.h"
24#include "xfs_ialloc.h"
25#include "xfs_bmap.h"
26#include "xfs_bmap_util.h"
27#include "xfs_errortag.h"
28#include "xfs_error.h"
29#include "xfs_quota.h"
30#include "xfs_filestream.h"
31#include "xfs_trace.h"
32#include "xfs_icache.h"
33#include "xfs_symlink.h"
34#include "xfs_trans_priv.h"
35#include "xfs_log.h"
36#include "xfs_bmap_btree.h"
37#include "xfs_reflink.h"
38
39kmem_zone_t *xfs_inode_zone;
40
41/*
42 * Used in xfs_itruncate_extents().  This is the maximum number of extents
43 * freed from a file in a single transaction.
44 */
45#define	XFS_ITRUNC_MAX_EXTENTS	2
46
47STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
48STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
49
50/*
51 * helper function to extract extent size hint from inode
52 */
53xfs_extlen_t
54xfs_get_extsz_hint(
55	struct xfs_inode	*ip)
56{
57	/*
58	 * No point in aligning allocations if we need to COW to actually
59	 * write to them.
60	 */
61	if (xfs_is_always_cow_inode(ip))
62		return 0;
63	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
64		return ip->i_d.di_extsize;
65	if (XFS_IS_REALTIME_INODE(ip))
66		return ip->i_mount->m_sb.sb_rextsize;
67	return 0;
68}
69
70/*
71 * Helper function to extract CoW extent size hint from inode.
72 * Between the extent size hint and the CoW extent size hint, we
73 * return the greater of the two.  If the value is zero (automatic),
74 * use the default size.
75 */
76xfs_extlen_t
77xfs_get_cowextsz_hint(
78	struct xfs_inode	*ip)
79{
80	xfs_extlen_t		a, b;
81
82	a = 0;
83	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
84		a = ip->i_d.di_cowextsize;
85	b = xfs_get_extsz_hint(ip);
86
87	a = max(a, b);
88	if (a == 0)
89		return XFS_DEFAULT_COWEXTSZ_HINT;
90	return a;
91}
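
/*
 * Illustrative example (not part of the original source; the values are
 * hypothetical): for a regular file with XFS_DIFLAG_EXTSIZE set and
 * di_extsize = 16 blocks, plus a CoW hint of 64 blocks set via
 * XFS_DIFLAG2_COWEXTSIZE:
 *
 *	xfs_get_extsz_hint(ip);		returns 16
 *	xfs_get_cowextsz_hint(ip);	returns max(64, 16) = 64
 *
 * If neither hint is set on a non-realtime, non-always-COW inode,
 * xfs_get_extsz_hint() returns 0 and xfs_get_cowextsz_hint() falls back
 * to XFS_DEFAULT_COWEXTSZ_HINT.
 */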
92
93/*
94 * These two are wrapper routines around the xfs_ilock() routine used to
95 * centralize some grungy code.  They are used in places that wish to lock the
96 * inode solely for reading the extents.  The reason these places can't just
97 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
98 * bringing in of the extents from disk for a file in b-tree format.  If the
99 * inode is in b-tree format, then we need to lock the inode exclusively until
100 * the extents are read in.  Locking it exclusively all the time would limit
101 * our parallelism unnecessarily, though.  What we do instead is check to see
102 * if the extents have been read in yet, and only lock the inode exclusively
103 * if they have not.
104 *
105 * The functions return a value which should be given to the corresponding
106 * xfs_iunlock() call.
107 */
108uint
109xfs_ilock_data_map_shared(
110	struct xfs_inode	*ip)
111{
112	uint			lock_mode = XFS_ILOCK_SHARED;
113
114	if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
115	    (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
116		lock_mode = XFS_ILOCK_EXCL;
117	xfs_ilock(ip, lock_mode);
118	return lock_mode;
119}
120
121uint
122xfs_ilock_attr_map_shared(
123	struct xfs_inode	*ip)
124{
125	uint			lock_mode = XFS_ILOCK_SHARED;
126
127	if (ip->i_afp &&
128	    ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
129	    (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
130		lock_mode = XFS_ILOCK_EXCL;
131	xfs_ilock(ip, lock_mode);
132	return lock_mode;
133}
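
/*
 * Illustrative usage sketch (not part of the original source; "ip" is an
 * assumed, already referenced inode): callers that only need to read the
 * data fork extent map pair the wrapper with xfs_iunlock() using the
 * returned lock mode:
 *
 *	uint	lock_mode;
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	... walk the in-core extent list ...
 *	xfs_iunlock(ip, lock_mode);
 *
 * The same pattern applies to xfs_ilock_attr_map_shared() for the
 * attribute fork.
 */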
134
135/*
136 * In addition to i_rwsem in the VFS inode, the xfs inode contains two
137 * multi-reader locks: the i_mmap_lock and the i_lock.  This routine allows
138 * various combinations of the locks to be obtained.
139 *
140 * The 3 locks should always be ordered so that the IO lock is obtained first,
141 * the mmap lock second and the ilock last in order to prevent deadlock.
142 *
143 * Basic locking order:
144 *
145 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
146 *
147 * mmap_lock locking order:
148 *
149 * i_rwsem -> page_lock -> mmap_lock
150 * mmap_lock -> i_mmap_lock -> page_lock
151 *
152 * The difference in mmap_lock locking order means that we cannot hold the
153 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
154 * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
155 * in get_user_pages() to map the user pages into the kernel address space for
156 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
157 * page faults already hold the mmap_lock.
158 *
159 * Hence to serialise fully against both syscall and mmap based IO, we need to
160 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
161 * taken in places where we need to invalidate the page cache in a race
162 * free manner (e.g. truncate, hole punch and other extent manipulation
163 * functions).
164 */
165void
166xfs_ilock(
167	xfs_inode_t		*ip,
168	uint			lock_flags)
169{
170	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
171
172	/*
173	 * You can't set both SHARED and EXCL for the same lock,
174	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
175	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
176	 */
177	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
178	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
179	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
180	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
181	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
182	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
183	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
184
185	if (lock_flags & XFS_IOLOCK_EXCL) {
186		down_write_nested(&VFS_I(ip)->i_rwsem,
187				  XFS_IOLOCK_DEP(lock_flags));
188	} else if (lock_flags & XFS_IOLOCK_SHARED) {
189		down_read_nested(&VFS_I(ip)->i_rwsem,
190				 XFS_IOLOCK_DEP(lock_flags));
191	}
192
193	if (lock_flags & XFS_MMAPLOCK_EXCL)
194		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
195	else if (lock_flags & XFS_MMAPLOCK_SHARED)
196		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
197
198	if (lock_flags & XFS_ILOCK_EXCL)
199		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
200	else if (lock_flags & XFS_ILOCK_SHARED)
201		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
202}
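
/*
 * Illustrative sketch (not part of the original source): an extent
 * manipulation path that must serialise against both syscall and mmap
 * based IO, as described above, takes the locks in the documented order
 * and drops them together:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 *	... invalidate the page cache, then take XFS_ILOCK_EXCL inside
 *	    the transaction that manipulates the extents ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 */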
203
204/*
205 * This is just like xfs_ilock(), except that the caller
206 * is guaranteed not to sleep.  It returns 1 if it gets
207 * the requested locks and 0 otherwise.  If the IO lock is
208 * obtained but the inode lock cannot be, then the IO lock
209 * is dropped before returning.
210 *
211 * ip -- the inode being locked
212 * lock_flags -- this parameter indicates the inode's locks
213 *       to be locked.  See the comment for xfs_ilock() for a list
214 *	 of valid values.
215 */
216int
217xfs_ilock_nowait(
218	xfs_inode_t		*ip,
219	uint			lock_flags)
220{
221	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
222
223	/*
224	 * You can't set both SHARED and EXCL for the same lock,
225	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
226	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
227	 */
228	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
229	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
230	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
231	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
232	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
233	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
234	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
235
236	if (lock_flags & XFS_IOLOCK_EXCL) {
237		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
238			goto out;
239	} else if (lock_flags & XFS_IOLOCK_SHARED) {
240		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
241			goto out;
242	}
243
244	if (lock_flags & XFS_MMAPLOCK_EXCL) {
245		if (!mrtryupdate(&ip->i_mmaplock))
246			goto out_undo_iolock;
247	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
248		if (!mrtryaccess(&ip->i_mmaplock))
249			goto out_undo_iolock;
250	}
251
252	if (lock_flags & XFS_ILOCK_EXCL) {
253		if (!mrtryupdate(&ip->i_lock))
254			goto out_undo_mmaplock;
255	} else if (lock_flags & XFS_ILOCK_SHARED) {
256		if (!mrtryaccess(&ip->i_lock))
257			goto out_undo_mmaplock;
258	}
259	return 1;
260
261out_undo_mmaplock:
262	if (lock_flags & XFS_MMAPLOCK_EXCL)
263		mrunlock_excl(&ip->i_mmaplock);
264	else if (lock_flags & XFS_MMAPLOCK_SHARED)
265		mrunlock_shared(&ip->i_mmaplock);
266out_undo_iolock:
267	if (lock_flags & XFS_IOLOCK_EXCL)
268		up_write(&VFS_I(ip)->i_rwsem);
269	else if (lock_flags & XFS_IOLOCK_SHARED)
270		up_read(&VFS_I(ip)->i_rwsem);
271out:
272	return 0;
273}
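
/*
 * Illustrative sketch (not part of the original source): a caller that
 * must not block, e.g. a background or reclaim style path, backs off when
 * the trylock fails instead of waiting:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *		return -EAGAIN;
 *	... do the work ...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */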
274
275/*
276 * xfs_iunlock() is used to drop the inode locks acquired with
277 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
278 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
279 * that we know which locks to drop.
280 *
281 * ip -- the inode being unlocked
282 * lock_flags -- this parameter indicates the inode's locks
283 *       to be unlocked.  See the comment for xfs_ilock() for a list
284 *	 of valid values for this parameter.
285 *
286 */
287void
288xfs_iunlock(
289	xfs_inode_t		*ip,
290	uint			lock_flags)
291{
292	/*
293	 * You can't set both SHARED and EXCL for the same lock,
294	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
295	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
296	 */
297	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
298	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
299	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
300	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
301	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
302	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
303	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
304	ASSERT(lock_flags != 0);
305
306	if (lock_flags & XFS_IOLOCK_EXCL)
307		up_write(&VFS_I(ip)->i_rwsem);
308	else if (lock_flags & XFS_IOLOCK_SHARED)
309		up_read(&VFS_I(ip)->i_rwsem);
310
311	if (lock_flags & XFS_MMAPLOCK_EXCL)
312		mrunlock_excl(&ip->i_mmaplock);
313	else if (lock_flags & XFS_MMAPLOCK_SHARED)
314		mrunlock_shared(&ip->i_mmaplock);
315
316	if (lock_flags & XFS_ILOCK_EXCL)
317		mrunlock_excl(&ip->i_lock);
318	else if (lock_flags & XFS_ILOCK_SHARED)
319		mrunlock_shared(&ip->i_lock);
320
321	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
322}
323
324/*
325 * Give up write locks.  The I/O lock cannot be held nested
326 * if it is being demoted.
327 */
328void
329xfs_ilock_demote(
330	xfs_inode_t		*ip,
331	uint			lock_flags)
332{
333	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
334	ASSERT((lock_flags &
335		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
336
337	if (lock_flags & XFS_ILOCK_EXCL)
338		mrdemote(&ip->i_lock);
339	if (lock_flags & XFS_MMAPLOCK_EXCL)
340		mrdemote(&ip->i_mmaplock);
341	if (lock_flags & XFS_IOLOCK_EXCL)
342		downgrade_write(&VFS_I(ip)->i_rwsem);
343
344	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
345}
346
347#if defined(DEBUG) || defined(XFS_WARN)
348int
349xfs_isilocked(
350	xfs_inode_t		*ip,
351	uint			lock_flags)
352{
353	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
354		if (!(lock_flags & XFS_ILOCK_SHARED))
355			return !!ip->i_lock.mr_writer;
356		return rwsem_is_locked(&ip->i_lock.mr_lock);
357	}
358
359	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
360		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
361			return !!ip->i_mmaplock.mr_writer;
362		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
363	}
364
365	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
366		if (!(lock_flags & XFS_IOLOCK_SHARED))
367			return !debug_locks ||
368				lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
369		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
370	}
371
372	ASSERT(0);
373	return 0;
374}
375#endif
376
377/*
378 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
379 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
380 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
381 * errors and warnings.
382 */
383#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
384static bool
385xfs_lockdep_subclass_ok(
386	int subclass)
387{
388	return subclass < MAX_LOCKDEP_SUBCLASSES;
389}
390#else
391#define xfs_lockdep_subclass_ok(subclass)	(true)
392#endif
393
394/*
395 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
396 * value. This can be called for any type of inode lock combination, including
397 * parent locking. Care must be taken to ensure we don't overrun the subclass
398 * storage fields in the class mask we build.
399 */
400static inline int
401xfs_lock_inumorder(int lock_mode, int subclass)
402{
403	int	class = 0;
404
405	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
406			      XFS_ILOCK_RTSUM)));
407	ASSERT(xfs_lockdep_subclass_ok(subclass));
408
409	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
410		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
411		class += subclass << XFS_IOLOCK_SHIFT;
412	}
413
414	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
415		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
416		class += subclass << XFS_MMAPLOCK_SHIFT;
417	}
418
419	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
420		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
421		class += subclass << XFS_ILOCK_SHIFT;
422	}
423
424	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
425}
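
/*
 * Illustrative sketch (not part of the original source): when several
 * inodes are locked in ascending i_ino order, each acquisition is
 * annotated with its position as the lockdep subclass so that the nesting
 * is not reported as a deadlock:
 *
 *	for (i = 0; i < inodes; i++)
 *		xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));
 */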
426
427/*
428 * The following routine will lock n inodes in exclusive mode.  We assume the
429 * caller calls us with the inodes in i_ino order.
430 *
431 * We need to detect deadlock where an inode that we lock is in the AIL and we
432 * start waiting for another inode that is locked by a thread in a long running
433 * transaction (such as truncate). This can result in deadlock since the long
434 * running trans might need to wait for the inode we just locked in order to
435 * push the tail and free space in the log.
436 *
437 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
438 * the iolock, the mmaplock or the ilock, but not more than one type at a time. If we
439 * lock more than one at a time, lockdep will report false positives saying we
440 * have violated locking orders.
441 */
442static void
443xfs_lock_inodes(
444	struct xfs_inode	**ips,
445	int			inodes,
446	uint			lock_mode)
447{
448	int			attempts = 0, i, j, try_lock;
449	struct xfs_log_item	*lp;
450
451	/*
452	 * Currently supports between 2 and 5 inodes with exclusive locking.  We
453	 * support an arbitrary depth of locking here, but absolute limits on
454	 * inodes depend on the type of locking and the limits placed by
455	 * lockdep annotations in xfs_lock_inumorder.  These are all checked by
456	 * the asserts.
457	 */
458	ASSERT(ips && inodes >= 2 && inodes <= 5);
459	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
460			    XFS_ILOCK_EXCL));
461	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
462			      XFS_ILOCK_SHARED)));
463	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
464		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
465	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
466		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
467
468	if (lock_mode & XFS_IOLOCK_EXCL) {
469		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
470	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
471		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
472
473	try_lock = 0;
474	i = 0;
475again:
476	for (; i < inodes; i++) {
477		ASSERT(ips[i]);
478
479		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
480			continue;
481
482		/*
483		 * If try_lock is not set yet, make sure all locked inodes are
484		 * not in the AIL.  If any are, set try_lock to be used later.
485		 */
486		if (!try_lock) {
487			for (j = (i - 1); j >= 0 && !try_lock; j--) {
488				lp = &ips[j]->i_itemp->ili_item;
489				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
490					try_lock++;
491			}
492		}
493
494		/*
495		 * If any of the previous locks we have locked is in the AIL,
496		 * we must TRY to get the second and subsequent locks. If
497		 * we can't get any, we must release all we have
498		 * and try again.
499		 */
500		if (!try_lock) {
501			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
502			continue;
503		}
504
505		/* try_lock means we have an inode locked that is in the AIL. */
506		ASSERT(i != 0);
507		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
508			continue;
509
510		/*
511		 * Unlock all previous guys and try again.  xfs_iunlock will try
512		 * to push the tail if the inode is in the AIL.
513		 */
514		attempts++;
515		for (j = i - 1; j >= 0; j--) {
516			/*
517			 * Check to see if we've already unlocked this one.  Not
518			 * the first one going back, and the inode ptr is the
519			 * same.
520			 */
521			if (j != (i - 1) && ips[j] == ips[j + 1])
522				continue;
523
524			xfs_iunlock(ips[j], lock_mode);
525		}
526
527		if ((attempts % 5) == 0) {
528			delay(1); /* Don't just spin the CPU */
529		}
530		i = 0;
531		try_lock = 0;
532		goto again;
533	}
534}
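
/*
 * Illustrative usage (not part of the original source; "ips" is an
 * assumed array already sorted by i_ino): rename-style callers lock all
 * of the involved inodes exclusively in one call:
 *
 *	xfs_lock_inodes(ips, 4, XFS_ILOCK_EXCL);
 *
 * Duplicate entries in the sorted array are tolerated and only locked
 * once.
 */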
535
536/*
537 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
538 * the mmaplock or the ilock, but not more than one type at a time. If we lock
539 * more than one at a time, lockdep will report false positives saying we have
540 * violated locking orders.  The iolock must be double-locked separately since
541 * we use i_rwsem for that.  We now support taking one lock EXCL and the other
542 * SHARED.
543 */
544void
545xfs_lock_two_inodes(
546	struct xfs_inode	*ip0,
547	uint			ip0_mode,
548	struct xfs_inode	*ip1,
549	uint			ip1_mode)
550{
551	struct xfs_inode	*temp;
552	uint			mode_temp;
553	int			attempts = 0;
554	struct xfs_log_item	*lp;
555
556	ASSERT(hweight32(ip0_mode) == 1);
557	ASSERT(hweight32(ip1_mode) == 1);
558	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
559	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
560	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
561	       !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
562	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
563	       !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
564	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
565	       !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
566	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
567	       !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
568
569	ASSERT(ip0->i_ino != ip1->i_ino);
570
571	if (ip0->i_ino > ip1->i_ino) {
572		temp = ip0;
573		ip0 = ip1;
574		ip1 = temp;
575		mode_temp = ip0_mode;
576		ip0_mode = ip1_mode;
577		ip1_mode = mode_temp;
578	}
579
580 again:
581	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
582
583	/*
584	 * If the first lock we have locked is in the AIL, we must TRY to get
585	 * the second lock. If we can't get it, we must release the first one
586	 * and try again.
587	 */
588	lp = &ip0->i_itemp->ili_item;
589	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
590		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
591			xfs_iunlock(ip0, ip0_mode);
592			if ((++attempts % 5) == 0)
593				delay(1); /* Don't just spin the CPU */
594			goto again;
595		}
596	} else {
597		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
598	}
599}
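
/*
 * Illustrative usage (not part of the original source): a link-style
 * operation that needs both inode cores locked, regardless of which inode
 * has the lower i_ino, simply calls:
 *
 *	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
 */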
600
601STATIC uint
602_xfs_dic2xflags(
603	uint16_t		di_flags,
604	uint64_t		di_flags2,
605	bool			has_attr)
606{
607	uint			flags = 0;
608
609	if (di_flags & XFS_DIFLAG_ANY) {
610		if (di_flags & XFS_DIFLAG_REALTIME)
611			flags |= FS_XFLAG_REALTIME;
612		if (di_flags & XFS_DIFLAG_PREALLOC)
613			flags |= FS_XFLAG_PREALLOC;
614		if (di_flags & XFS_DIFLAG_IMMUTABLE)
615			flags |= FS_XFLAG_IMMUTABLE;
616		if (di_flags & XFS_DIFLAG_APPEND)
617			flags |= FS_XFLAG_APPEND;
618		if (di_flags & XFS_DIFLAG_SYNC)
619			flags |= FS_XFLAG_SYNC;
620		if (di_flags & XFS_DIFLAG_NOATIME)
621			flags |= FS_XFLAG_NOATIME;
622		if (di_flags & XFS_DIFLAG_NODUMP)
623			flags |= FS_XFLAG_NODUMP;
624		if (di_flags & XFS_DIFLAG_RTINHERIT)
625			flags |= FS_XFLAG_RTINHERIT;
626		if (di_flags & XFS_DIFLAG_PROJINHERIT)
627			flags |= FS_XFLAG_PROJINHERIT;
628		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
629			flags |= FS_XFLAG_NOSYMLINKS;
630		if (di_flags & XFS_DIFLAG_EXTSIZE)
631			flags |= FS_XFLAG_EXTSIZE;
632		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
633			flags |= FS_XFLAG_EXTSZINHERIT;
634		if (di_flags & XFS_DIFLAG_NODEFRAG)
635			flags |= FS_XFLAG_NODEFRAG;
636		if (di_flags & XFS_DIFLAG_FILESTREAM)
637			flags |= FS_XFLAG_FILESTREAM;
638	}
639
640	if (di_flags2 & XFS_DIFLAG2_ANY) {
641		if (di_flags2 & XFS_DIFLAG2_DAX)
642			flags |= FS_XFLAG_DAX;
643		if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
644			flags |= FS_XFLAG_COWEXTSIZE;
645	}
646
647	if (has_attr)
648		flags |= FS_XFLAG_HASATTR;
649
650	return flags;
651}
652
653uint
654xfs_ip2xflags(
655	struct xfs_inode	*ip)
656{
657	struct xfs_icdinode	*dic = &ip->i_d;
658
659	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
660}
661
662/*
663 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
664 * is allowed, otherwise it has to be an exact match. If a CI match is found,
665 * ci_name->name will point to the actual name (caller must free) or
666 * will be set to NULL if an exact match is found.
667 */
668int
669xfs_lookup(
670	xfs_inode_t		*dp,
671	struct xfs_name		*name,
672	xfs_inode_t		**ipp,
673	struct xfs_name		*ci_name)
674{
675	xfs_ino_t		inum;
676	int			error;
677
678	trace_xfs_lookup(dp, name);
679
680	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
681		return -EIO;
682
683	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
684	if (error)
685		goto out_unlock;
686
687	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
688	if (error)
689		goto out_free_name;
690
691	return 0;
692
693out_free_name:
694	if (ci_name)
695		kmem_free(ci_name->name);
696out_unlock:
697	*ipp = NULL;
698	return error;
699}
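
/*
 * Illustrative sketch (not part of the original source): a case-insensitive
 * caller passes a ci_name and must free the returned match name once it is
 * done with it:
 *
 *	struct xfs_name		ci_name;
 *	struct xfs_inode	*ip;
 *
 *	error = xfs_lookup(dp, name, &ip, &ci_name);
 *	if (!error && ci_name.name) {
 *		... use the case-insensitive match ...
 *		kmem_free(ci_name.name);
 *	}
 */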
700
701/* Propagate di_flags from a parent inode to a child inode. */
702static void
703xfs_inode_inherit_flags(
704	struct xfs_inode	*ip,
705	const struct xfs_inode	*pip)
706{
707	unsigned int		di_flags = 0;
708	umode_t			mode = VFS_I(ip)->i_mode;
709
710	if (S_ISDIR(mode)) {
711		if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
712			di_flags |= XFS_DIFLAG_RTINHERIT;
713		if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
714			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
715			ip->i_d.di_extsize = pip->i_d.di_extsize;
716		}
717		if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
718			di_flags |= XFS_DIFLAG_PROJINHERIT;
719	} else if (S_ISREG(mode)) {
720		if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
721		    xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
722			di_flags |= XFS_DIFLAG_REALTIME;
723		if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
724			di_flags |= XFS_DIFLAG_EXTSIZE;
725			ip->i_d.di_extsize = pip->i_d.di_extsize;
726		}
727	}
728	if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
729	    xfs_inherit_noatime)
730		di_flags |= XFS_DIFLAG_NOATIME;
731	if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
732	    xfs_inherit_nodump)
733		di_flags |= XFS_DIFLAG_NODUMP;
734	if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
735	    xfs_inherit_sync)
736		di_flags |= XFS_DIFLAG_SYNC;
737	if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
738	    xfs_inherit_nosymlinks)
739		di_flags |= XFS_DIFLAG_NOSYMLINKS;
740	if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
741	    xfs_inherit_nodefrag)
742		di_flags |= XFS_DIFLAG_NODEFRAG;
743	if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
744		di_flags |= XFS_DIFLAG_FILESTREAM;
745
746	ip->i_d.di_flags |= di_flags;
747}
748
749/* Propagate di_flags2 from a parent inode to a child inode. */
750static void
751xfs_inode_inherit_flags2(
752	struct xfs_inode	*ip,
753	const struct xfs_inode	*pip)
754{
755	if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
756		ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
757		ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
758	}
759	if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
760		ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
761}
762
763/*
764 * Allocate an inode on disk and return a copy of its in-core version.
765 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
766 * appropriately within the inode.  The uid and gid for the inode are
767 * set according to the contents of the given cred structure.
768 *
769 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
770 * has a free inode available, call xfs_iget() to obtain the in-core
771 * version of the allocated inode.  Finally, fill in the inode and
772 * log its initial contents.  In this case, ialloc_context would be
773 * set to NULL.
774 *
775 * If xfs_dialloc() does not have an available inode, it will replenish
776 * its supply by doing an allocation. Since we can only do one
777 * allocation within a transaction without deadlocks, we must commit
778 * the current transaction before returning the inode itself.
779 * In this case, therefore, we will set ialloc_context and return.
780 * The caller should then commit the current transaction, start a new
781 * transaction, and call xfs_ialloc() again to actually get the inode.
782 *
783 * To ensure that some other process does not grab the inode that
784 * was allocated during the first call to xfs_ialloc(), this routine
785 * also returns the [locked] bp pointing to the head of the freelist
786 * as ialloc_context.  The caller should hold this buffer across
787 * the commit and pass it back into this routine on the second call.
788 *
789 * If we are allocating quota inodes, we do not have a parent inode
790 * to attach to or associate with (i.e. pip == NULL) because they
791 * are not linked into the directory structure - they are attached
792 * directly to the superblock - and so have no parent.
793 */
794static int
795xfs_ialloc(
796	xfs_trans_t	*tp,
797	xfs_inode_t	*pip,
798	umode_t		mode,
799	xfs_nlink_t	nlink,
800	dev_t		rdev,
801	prid_t		prid,
802	xfs_buf_t	**ialloc_context,
803	xfs_inode_t	**ipp)
804{
805	struct inode *dir = pip ? VFS_I(pip) : NULL;
806	struct xfs_mount *mp = tp->t_mountp;
807	xfs_ino_t	ino;
808	xfs_inode_t	*ip;
809	uint		flags;
810	int		error;
811	struct timespec64 tv;
812	struct inode	*inode;
813
814	/*
815	 * Call the space management code to pick
816	 * the on-disk inode to be allocated.
817	 */
818	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
819			    ialloc_context, &ino);
820	if (error)
821		return error;
822	if (*ialloc_context || ino == NULLFSINO) {
823		*ipp = NULL;
824		return 0;
825	}
826	ASSERT(*ialloc_context == NULL);
827
828	/*
829	 * Protect against obviously corrupt allocation btree records. Later
830	 * xfs_iget checks will catch re-allocation of other active in-memory
831	 * and on-disk inodes. If we don't catch reallocating the parent inode
832	 * here we will deadlock in xfs_iget() so we have to do these checks
833	 * first.
834	 */
835	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
836		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
837		return -EFSCORRUPTED;
838	}
839
840	/*
841	 * Get the in-core inode with the lock held exclusively.
842	 * This is because we're setting fields here we need
843	 * to prevent others from looking at until we're done.
844	 */
845	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
846			 XFS_ILOCK_EXCL, &ip);
847	if (error)
848		return error;
849	ASSERT(ip != NULL);
850	inode = VFS_I(ip);
851	set_nlink(inode, nlink);
852	inode->i_rdev = rdev;
853	ip->i_d.di_projid = prid;
854
855	if (dir && !(dir->i_mode & S_ISGID) &&
856			(mp->m_flags & XFS_MOUNT_GRPID)) {
857		inode->i_uid = current_fsuid();
858		inode->i_gid = dir->i_gid;
859		inode->i_mode = mode;
860	} else {
861		inode_init_owner(inode, dir, mode);
862	}
863
864	/*
865	 * If the group ID of the new file does not match the effective group
866	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
867	 * (and only if the irix_sgid_inherit compatibility variable is set).
868	 */
869	if (irix_sgid_inherit &&
870	    (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
871		inode->i_mode &= ~S_ISGID;
872
873	ip->i_d.di_size = 0;
874	ip->i_df.if_nextents = 0;
875	ASSERT(ip->i_d.di_nblocks == 0);
876
877	tv = current_time(inode);
878	inode->i_mtime = tv;
879	inode->i_atime = tv;
880	inode->i_ctime = tv;
881
882	ip->i_d.di_extsize = 0;
883	ip->i_d.di_dmevmask = 0;
884	ip->i_d.di_dmstate = 0;
885	ip->i_d.di_flags = 0;
886
887	if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
888		inode_set_iversion(inode, 1);
889		ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
890		ip->i_d.di_cowextsize = 0;
891		ip->i_d.di_crtime = tv;
892	}
893
894	flags = XFS_ILOG_CORE;
895	switch (mode & S_IFMT) {
896	case S_IFIFO:
897	case S_IFCHR:
898	case S_IFBLK:
899	case S_IFSOCK:
900		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
901		ip->i_df.if_flags = 0;
902		flags |= XFS_ILOG_DEV;
903		break;
904	case S_IFREG:
905	case S_IFDIR:
906		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
907			xfs_inode_inherit_flags(ip, pip);
908		if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
909			xfs_inode_inherit_flags2(ip, pip);
910		/* FALLTHROUGH */
911	case S_IFLNK:
912		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
913		ip->i_df.if_flags = XFS_IFEXTENTS;
914		ip->i_df.if_bytes = 0;
915		ip->i_df.if_u1.if_root = NULL;
916		break;
917	default:
918		ASSERT(0);
919	}
920
921	/*
922	 * Log the new values stuffed into the inode.
923	 */
924	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
925	xfs_trans_log_inode(tp, ip, flags);
926
927	/* now that we have an i_mode we can setup the inode structure */
928	xfs_setup_inode(ip);
929
930	*ipp = ip;
931	return 0;
932}
933
934/*
935 * Allocates a new inode from disk and returns a pointer to the
936 * incore copy. This routine will internally commit the current
937 * transaction and allocate a new one if the Space Manager needed
938 * to do an allocation to replenish the inode free-list.
939 *
940 * This routine is designed to be called from xfs_create and
941 * xfs_create_dir.
942 *
943 */
944int
945xfs_dir_ialloc(
946	xfs_trans_t	**tpp,		/* input: current transaction;
947					   output: may be a new transaction. */
948	xfs_inode_t	*dp,		/* directory within which to allocate
949					   the inode. */
950	umode_t		mode,
951	xfs_nlink_t	nlink,
952	dev_t		rdev,
953	prid_t		prid,		/* project id */
954	xfs_inode_t	**ipp)		/* pointer to inode; it will be
955					   locked. */
956{
957	xfs_trans_t	*tp;
958	xfs_inode_t	*ip;
959	xfs_buf_t	*ialloc_context = NULL;
960	int		code;
961	void		*dqinfo;
962	uint		tflags;
963
964	tp = *tpp;
965	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
966
967	/*
968	 * xfs_ialloc will return a pointer to an incore inode if
969	 * the Space Manager has an available inode on the free
970	 * list. Otherwise, it will do an allocation and replenish
971	 * the freelist.  Since we can only do one allocation per
972	 * transaction without deadlocks, we will need to commit the
973	 * current transaction and start a new one.  We will then
974	 * need to call xfs_ialloc again to get the inode.
975	 *
976	 * If xfs_ialloc did an allocation to replenish the freelist,
977	 * it returns the bp containing the head of the freelist as
978	 * ialloc_context. We will hold a lock on it across the
979	 * transaction commit so that no other process can steal
980	 * the inode(s) that we've just allocated.
981	 */
982	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
983			&ip);
984
985	/*
986	 * Return an error if we were unable to allocate a new inode.
987	 * This should only happen if we run out of space on disk or
988	 * encounter a disk error.
989	 */
990	if (code) {
991		*ipp = NULL;
992		return code;
993	}
994	if (!ialloc_context && !ip) {
995		*ipp = NULL;
996		return -ENOSPC;
997	}
998
999	/*
1000	 * If the AGI buffer is non-NULL, then we were unable to get an
1001	 * inode in one operation.  We need to commit the current
1002	 * transaction and call xfs_ialloc() again.  It is guaranteed
1003	 * to succeed the second time.
1004	 */
1005	if (ialloc_context) {
1006		/*
1007		 * Normally, xfs_trans_commit releases all the locks.
1008		 * We call bhold to hang on to the ialloc_context across
1009		 * the commit.  Holding this buffer prevents any other
1010		 * processes from doing any allocations in this
1011		 * allocation group.
1012		 */
1013		xfs_trans_bhold(tp, ialloc_context);
1014
1015		/*
1016		 * We want the quota changes to be associated with the next
1017		 * transaction, NOT this one. So, detach the dqinfo from this
1018		 * and attach it to the next transaction.
1019		 */
1020		dqinfo = NULL;
1021		tflags = 0;
1022		if (tp->t_dqinfo) {
1023			dqinfo = (void *)tp->t_dqinfo;
1024			tp->t_dqinfo = NULL;
1025			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
1026			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
1027		}
1028
1029		code = xfs_trans_roll(&tp);
1030
1031		/*
1032		 * Re-attach the quota info that we detached from prev trx.
1033		 */
1034		if (dqinfo) {
1035			tp->t_dqinfo = dqinfo;
1036			tp->t_flags |= tflags;
1037		}
1038
1039		if (code) {
1040			xfs_buf_relse(ialloc_context);
1041			*tpp = tp;
1042			*ipp = NULL;
1043			return code;
1044		}
1045		xfs_trans_bjoin(tp, ialloc_context);
1046
1047		/*
1048		 * Call ialloc again. Since we've locked out all
1049		 * other allocations in this allocation group,
1050		 * this call should always succeed.
1051		 */
1052		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
1053				  &ialloc_context, &ip);
1054
1055		/*
1056		 * If we get an error at this point, return to the caller
1057		 * so that the current transaction can be aborted.
1058		 */
1059		if (code) {
1060			*tpp = tp;
1061			*ipp = NULL;
1062			return code;
1063		}
1064		ASSERT(!ialloc_context && ip);
1065
1066	}
1067
1068	*ipp = ip;
1069	*tpp = tp;
1070
1071	return 0;
1072}
1073
1074/*
1075 * Decrement the link count on an inode & log the change.  If this causes the
1076 * link count to go to zero, move the inode to the AGI unlinked list so that it can
1077 * be freed when the last active reference goes away via xfs_inactive().
1078 */
1079static int			/* error */
1080xfs_droplink(
1081	xfs_trans_t *tp,
1082	xfs_inode_t *ip)
1083{
1084	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1085
1086	drop_nlink(VFS_I(ip));
1087	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1088
1089	if (VFS_I(ip)->i_nlink)
1090		return 0;
1091
1092	return xfs_iunlink(tp, ip);
1093}
1094
1095/*
1096 * Increment the link count on an inode & log the change.
1097 */
1098static void
1099xfs_bumplink(
1100	xfs_trans_t *tp,
1101	xfs_inode_t *ip)
1102{
1103	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1104
1105	inc_nlink(VFS_I(ip));
1106	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1107}
1108
1109int
1110xfs_create(
1111	xfs_inode_t		*dp,
1112	struct xfs_name		*name,
1113	umode_t			mode,
1114	dev_t			rdev,
1115	xfs_inode_t		**ipp)
1116{
1117	int			is_dir = S_ISDIR(mode);
1118	struct xfs_mount	*mp = dp->i_mount;
1119	struct xfs_inode	*ip = NULL;
1120	struct xfs_trans	*tp = NULL;
1121	int			error;
1122	bool                    unlock_dp_on_error = false;
1123	prid_t			prid;
1124	struct xfs_dquot	*udqp = NULL;
1125	struct xfs_dquot	*gdqp = NULL;
1126	struct xfs_dquot	*pdqp = NULL;
1127	struct xfs_trans_res	*tres;
1128	uint			resblks;
1129
1130	trace_xfs_create(dp, name);
1131
1132	if (XFS_FORCED_SHUTDOWN(mp))
1133		return -EIO;
1134
1135	prid = xfs_get_initial_prid(dp);
1136
1137	/*
1138	 * Make sure that we have allocated dquot(s) on disk.
1139	 */
1140	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1141					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1142					&udqp, &gdqp, &pdqp);
1143	if (error)
1144		return error;
1145
1146	if (is_dir) {
1147		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1148		tres = &M_RES(mp)->tr_mkdir;
1149	} else {
1150		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1151		tres = &M_RES(mp)->tr_create;
1152	}
1153
1154	/*
1155	 * Initially assume that the file does not exist and
1156	 * reserve the resources for that case.  If that is not
1157	 * the case we'll drop the one we have and get a more
1158	 * appropriate transaction later.
1159	 */
1160	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1161	if (error == -ENOSPC) {
1162		/* flush outstanding delalloc blocks and retry */
1163		xfs_flush_inodes(mp);
1164		error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1165	}
1166	if (error)
1167		goto out_release_inode;
1168
1169	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1170	unlock_dp_on_error = true;
1171
1172	/*
1173	 * Reserve disk quota and the inode.
1174	 */
1175	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1176						pdqp, resblks, 1, 0);
1177	if (error)
1178		goto out_trans_cancel;
1179
1180	/*
1181	 * A newly created regular or special file just has one directory
1182	 * entry pointing to it, but a directory also has the "." entry
1183	 * pointing to itself.
1184	 */
1185	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
1186	if (error)
1187		goto out_trans_cancel;
1188
1189	/*
1190	 * Now we join the directory inode to the transaction.  We do not do it
1191	 * earlier because xfs_dir_ialloc might commit the previous transaction
1192	 * (and release all the locks).  An error from here on will result in
1193	 * the transaction cancel unlocking dp so don't do it explicitly in the
1194	 * error path.
1195	 */
1196	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1197	unlock_dp_on_error = false;
1198
1199	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1200					resblks - XFS_IALLOC_SPACE_RES(mp));
1201	if (error) {
1202		ASSERT(error != -ENOSPC);
1203		goto out_trans_cancel;
1204	}
1205	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1206	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1207
1208	if (is_dir) {
1209		error = xfs_dir_init(tp, ip, dp);
1210		if (error)
1211			goto out_trans_cancel;
1212
1213		xfs_bumplink(tp, dp);
1214	}
1215
1216	/*
1217	 * If this is a synchronous mount, make sure that the
1218	 * create transaction goes to disk before returning to
1219	 * the user.
1220	 */
1221	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1222		xfs_trans_set_sync(tp);
1223
1224	/*
1225	 * Attach the dquot(s) to the inodes and modify them incore.
1226	 * The ids of the inode couldn't have changed since the new
1227	 * inode has been locked ever since it was created.
1228	 */
1229	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1230
1231	error = xfs_trans_commit(tp);
1232	if (error)
1233		goto out_release_inode;
1234
1235	xfs_qm_dqrele(udqp);
1236	xfs_qm_dqrele(gdqp);
1237	xfs_qm_dqrele(pdqp);
1238
1239	*ipp = ip;
1240	return 0;
1241
1242 out_trans_cancel:
1243	xfs_trans_cancel(tp);
1244 out_release_inode:
1245	/*
1246	 * Wait until after the current transaction is aborted to finish the
1247	 * setup of the inode and release the inode.  This prevents recursive
1248	 * transactions and deadlocks from xfs_inactive.
1249	 */
1250	if (ip) {
1251		xfs_finish_inode_setup(ip);
1252		xfs_irele(ip);
1253	}
1254
1255	xfs_qm_dqrele(udqp);
1256	xfs_qm_dqrele(gdqp);
1257	xfs_qm_dqrele(pdqp);
1258
1259	if (unlock_dp_on_error)
1260		xfs_iunlock(dp, XFS_ILOCK_EXCL);
1261	return error;
1262}
1263
1264int
1265xfs_create_tmpfile(
1266	struct xfs_inode	*dp,
1267	umode_t			mode,
1268	struct xfs_inode	**ipp)
1269{
1270	struct xfs_mount	*mp = dp->i_mount;
1271	struct xfs_inode	*ip = NULL;
1272	struct xfs_trans	*tp = NULL;
1273	int			error;
1274	prid_t                  prid;
1275	struct xfs_dquot	*udqp = NULL;
1276	struct xfs_dquot	*gdqp = NULL;
1277	struct xfs_dquot	*pdqp = NULL;
1278	struct xfs_trans_res	*tres;
1279	uint			resblks;
1280
1281	if (XFS_FORCED_SHUTDOWN(mp))
1282		return -EIO;
1283
1284	prid = xfs_get_initial_prid(dp);
1285
1286	/*
1287	 * Make sure that we have allocated dquot(s) on disk.
1288	 */
1289	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1290				XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1291				&udqp, &gdqp, &pdqp);
1292	if (error)
1293		return error;
1294
1295	resblks = XFS_IALLOC_SPACE_RES(mp);
1296	tres = &M_RES(mp)->tr_create_tmpfile;
1297
1298	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1299	if (error)
1300		goto out_release_inode;
1301
1302	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1303						pdqp, resblks, 1, 0);
1304	if (error)
1305		goto out_trans_cancel;
1306
1307	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
1308	if (error)
1309		goto out_trans_cancel;
1310
1311	if (mp->m_flags & XFS_MOUNT_WSYNC)
1312		xfs_trans_set_sync(tp);
1313
1314	/*
1315	 * Attach the dquot(s) to the inodes and modify them incore.
1316	 * The ids of the inode couldn't have changed since the new
1317	 * inode has been locked ever since it was created.
1318	 */
1319	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1320
1321	error = xfs_iunlink(tp, ip);
1322	if (error)
1323		goto out_trans_cancel;
1324
1325	error = xfs_trans_commit(tp);
1326	if (error)
1327		goto out_release_inode;
1328
1329	xfs_qm_dqrele(udqp);
1330	xfs_qm_dqrele(gdqp);
1331	xfs_qm_dqrele(pdqp);
1332
1333	*ipp = ip;
1334	return 0;
1335
1336 out_trans_cancel:
1337	xfs_trans_cancel(tp);
1338 out_release_inode:
1339	/*
1340	 * Wait until after the current transaction is aborted to finish the
1341	 * setup of the inode and release the inode.  This prevents recursive
1342	 * transactions and deadlocks from xfs_inactive.
1343	 */
1344	if (ip) {
1345		xfs_finish_inode_setup(ip);
1346		xfs_irele(ip);
1347	}
1348
1349	xfs_qm_dqrele(udqp);
1350	xfs_qm_dqrele(gdqp);
1351	xfs_qm_dqrele(pdqp);
1352
1353	return error;
1354}
1355
1356int
1357xfs_link(
1358	xfs_inode_t		*tdp,
1359	xfs_inode_t		*sip,
1360	struct xfs_name		*target_name)
1361{
1362	xfs_mount_t		*mp = tdp->i_mount;
1363	xfs_trans_t		*tp;
1364	int			error;
1365	int			resblks;
1366
1367	trace_xfs_link(tdp, target_name);
1368
1369	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1370
1371	if (XFS_FORCED_SHUTDOWN(mp))
1372		return -EIO;
1373
1374	error = xfs_qm_dqattach(sip);
1375	if (error)
1376		goto std_return;
1377
1378	error = xfs_qm_dqattach(tdp);
1379	if (error)
1380		goto std_return;
1381
1382	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1383	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
1384	if (error == -ENOSPC) {
1385		resblks = 0;
1386		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
1387	}
1388	if (error)
1389		goto std_return;
1390
1391	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
1392
1393	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1394	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1395
1396	/*
1397	 * If we are using project inheritance, we only allow hard link
1398	 * creation in our tree when the project IDs are the same; else
1399	 * the tree quota mechanism could be circumvented.
1400	 */
1401	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1402		     tdp->i_d.di_projid != sip->i_d.di_projid)) {
1403		error = -EXDEV;
1404		goto error_return;
1405	}
1406
1407	if (!resblks) {
1408		error = xfs_dir_canenter(tp, tdp, target_name);
1409		if (error)
1410			goto error_return;
1411	}
1412
1413	/*
1414	 * Handle initial link state of O_TMPFILE inode
1415	 */
1416	if (VFS_I(sip)->i_nlink == 0) {
1417		error = xfs_iunlink_remove(tp, sip);
1418		if (error)
1419			goto error_return;
1420	}
1421
1422	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1423				   resblks);
1424	if (error)
1425		goto error_return;
1426	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1427	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1428
1429	xfs_bumplink(tp, sip);
1430
1431	/*
1432	 * If this is a synchronous mount, make sure that the
1433	 * link transaction goes to disk before returning to
1434	 * the user.
1435	 */
1436	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1437		xfs_trans_set_sync(tp);
1438
1439	return xfs_trans_commit(tp);
1440
1441 error_return:
1442	xfs_trans_cancel(tp);
1443 std_return:
1444	return error;
1445}
1446
1447/* Clear the reflink flag and the cowblocks tag if possible. */
1448static void
1449xfs_itruncate_clear_reflink_flags(
1450	struct xfs_inode	*ip)
1451{
1452	struct xfs_ifork	*dfork;
1453	struct xfs_ifork	*cfork;
1454
1455	if (!xfs_is_reflink_inode(ip))
1456		return;
1457	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1458	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1459	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1460		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
1461	if (cfork->if_bytes == 0)
1462		xfs_inode_clear_cowblocks_tag(ip);
1463}
1464
1465/*
1466 * Free up the underlying blocks past new_size.  The new size must be smaller
1467 * than the current size.  This routine can be used both for the attribute and
1468 * data fork, and does not modify the inode size, which is left to the caller.
1469 *
1470 * The transaction passed to this routine must have made a permanent log
1471 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1472 * given transaction and start new ones, so make sure everything involved in
1473 * the transaction is tidy before calling here.  Some transaction will be
1474 * returned to the caller to be committed.  The incoming transaction must
1475 * already include the inode, and both inode locks must be held exclusively.
1476 * The inode must also be "held" within the transaction.  On return the inode
1477 * will be "held" within the returned transaction.  This routine does NOT
1478 * require any disk space to be reserved for it within the transaction.
1479 *
1480 * If we get an error, we must return with the inode locked and linked into the
1481 * current transaction. This keeps things simple for the higher level code,
1482 * because it always knows that the inode is locked and held in the transaction
1483 * that returns to it whether errors occur or not.  We don't mark the inode
1484 * dirty on error so that transactions can be easily aborted if possible.
1485 */
1486int
1487xfs_itruncate_extents_flags(
1488	struct xfs_trans	**tpp,
1489	struct xfs_inode	*ip,
1490	int			whichfork,
1491	xfs_fsize_t		new_size,
1492	int			flags)
1493{
1494	struct xfs_mount	*mp = ip->i_mount;
1495	struct xfs_trans	*tp = *tpp;
1496	xfs_fileoff_t		first_unmap_block;
1497	xfs_filblks_t		unmap_len;
1498	int			error = 0;
1499
1500	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1501	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1502	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1503	ASSERT(new_size <= XFS_ISIZE(ip));
1504	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1505	ASSERT(ip->i_itemp != NULL);
1506	ASSERT(ip->i_itemp->ili_lock_flags == 0);
1507	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1508
1509	trace_xfs_itruncate_extents_start(ip, new_size);
1510
1511	flags |= xfs_bmapi_aflag(whichfork);
1512
1513	/*
1514	 * Since it is possible for space to become allocated beyond
1515	 * the end of the file (in a crash where the space is allocated
1516	 * but the inode size is not yet updated), simply remove any
1517	 * blocks which show up between the new EOF and the maximum
1518	 * possible file size.
1519	 *
1520	 * We have to free all the blocks to the bmbt maximum offset, even if
1521	 * the page cache can't scale that far.
1522	 */
1523	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1524	if (first_unmap_block >= XFS_MAX_FILEOFF) {
1525		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1526		return 0;
1527	}
1528
1529	unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1530	while (unmap_len > 0) {
1531		ASSERT(tp->t_firstblock == NULLFSBLOCK);
1532		error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1533				flags, XFS_ITRUNC_MAX_EXTENTS);
1534		if (error)
1535			goto out;
1536
1537		/* free the just unmapped extents */
1538		error = xfs_defer_finish(&tp);
1539		if (error)
1540			goto out;
1541	}
1542
1543	if (whichfork == XFS_DATA_FORK) {
1544		/* Remove all pending CoW reservations. */
1545		error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1546				first_unmap_block, XFS_MAX_FILEOFF, true);
1547		if (error)
1548			goto out;
1549
1550		xfs_itruncate_clear_reflink_flags(ip);
1551	}
1552
1553	/*
1554	 * Always re-log the inode so that our permanent transaction can keep
1555	 * on rolling it forward in the log.
1556	 */
1557	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1558
1559	trace_xfs_itruncate_extents_end(ip, new_size);
1560
1561out:
1562	*tpp = tp;
1563	return error;
1564}
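
/*
 * Illustrative caller pattern (not part of the original source; error
 * handling elided), modelled on the unlinked-inode truncate further below
 * and following the requirements in the comment above: the transaction
 * carries an itruncate reservation, the inode is joined and held, and
 * whatever transaction comes back is committed by the caller:
 *
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	error = xfs_trans_commit(tp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */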
1565
1566int
1567xfs_release(
1568	xfs_inode_t	*ip)
1569{
1570	xfs_mount_t	*mp = ip->i_mount;
1571	int		error;
1572
1573	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1574		return 0;
1575
1576	/* If this is a read-only mount, don't do this (would generate I/O) */
1577	if (mp->m_flags & XFS_MOUNT_RDONLY)
1578		return 0;
1579
1580	if (!XFS_FORCED_SHUTDOWN(mp)) {
1581		int truncated;
1582
1583		/*
1584		 * If we previously truncated this file and removed old data
1585		 * in the process, we want to initiate "early" writeout on
1586		 * the last close.  This is an attempt to combat the notorious
1587		 * NULL files problem which is particularly noticeable from a
1588		 * truncate down, buffered (re-)write (delalloc), followed by
1589		 * a crash.  What we are effectively doing here is
1590		 * significantly reducing the time window where we'd otherwise
1591		 * be exposed to that problem.
1592		 */
1593		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1594		if (truncated) {
1595			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1596			if (ip->i_delayed_blks > 0) {
1597				error = filemap_flush(VFS_I(ip)->i_mapping);
1598				if (error)
1599					return error;
1600			}
1601		}
1602	}
1603
1604	if (VFS_I(ip)->i_nlink == 0)
1605		return 0;
1606
1607	if (xfs_can_free_eofblocks(ip, false)) {
1608
1609		/*
1610		 * If the inode is being opened, written and closed frequently
1611		 * and we have delayed allocation blocks outstanding (e.g.
1612		 * streaming writes from the NFS server), then truncating the
1613		 * blocks past EOF will cause fragmentation to occur.
1614		 *
1615		 * In this case don't do the truncation, but we have to be
1616		 * careful how we detect this case. Blocks beyond EOF show up as
1617		 * i_delayed_blks even when the inode is clean, so we need to
1618		 * truncate them away first before checking for a dirty release.
1619		 * Hence on the first dirty close we will still remove the
1620		 * speculative allocation, but after that we will leave it in
1621		 * place.
1622		 */
1623		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1624			return 0;
1625		/*
1626		 * If we can't get the iolock just skip truncating the blocks
1627		 * past EOF because we could deadlock with the mmap_lock
1628		 * otherwise. We'll get another chance to drop them once the
1629		 * last reference to the inode is dropped, so we'll never leak
1630		 * blocks permanently.
1631		 */
1632		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1633			error = xfs_free_eofblocks(ip);
1634			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1635			if (error)
1636				return error;
1637		}
1638
1639		/* delalloc blocks after truncation means it really is dirty */
1640		if (ip->i_delayed_blks)
1641			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1642	}
1643	return 0;
1644}
1645
1646/*
1647 * xfs_inactive_truncate
1648 *
1649 * Called to perform a truncate when an inode becomes unlinked.
1650 */
1651STATIC int
1652xfs_inactive_truncate(
1653	struct xfs_inode *ip)
1654{
1655	struct xfs_mount	*mp = ip->i_mount;
1656	struct xfs_trans	*tp;
1657	int			error;
1658
1659	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1660	if (error) {
1661		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1662		return error;
1663	}
1664	xfs_ilock(ip, XFS_ILOCK_EXCL);
1665	xfs_trans_ijoin(tp, ip, 0);
1666
1667	/*
1668	 * Log the inode size first to prevent stale data exposure in the event
1669	 * of a system crash before the truncate completes. See the related
1670	 * comment in xfs_vn_setattr_size() for details.
1671	 */
1672	ip->i_d.di_size = 0;
1673	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1674
1675	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1676	if (error)
1677		goto error_trans_cancel;
1678
1679	ASSERT(ip->i_df.if_nextents == 0);
1680
1681	error = xfs_trans_commit(tp);
1682	if (error)
1683		goto error_unlock;
1684
1685	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1686	return 0;
1687
1688error_trans_cancel:
1689	xfs_trans_cancel(tp);
1690error_unlock:
1691	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1692	return error;
1693}
1694
1695/*
1696 * xfs_inactive_ifree()
1697 *
1698 * Perform the inode free when an inode is unlinked.
1699 */
1700STATIC int
1701xfs_inactive_ifree(
1702	struct xfs_inode *ip)
1703{
1704	struct xfs_mount	*mp = ip->i_mount;
1705	struct xfs_trans	*tp;
1706	int			error;
1707
1708	/*
1709	 * We try to use a per-AG reservation for any block needed by the finobt
1710	 * tree, but as the finobt feature predates the per-AG reservation
1711	 * support, a degraded file system might not have enough space for the
1712	 * reservation at mount time.  In that case try to dip into the reserved
1713	 * pool and pray.
1714	 *
1715	 * Send a warning if the reservation does happen to fail, as the inode
1716	 * now remains allocated and sits on the unlinked list until the fs is
1717	 * repaired.
1718	 */
1719	if (unlikely(mp->m_finobt_nores)) {
1720		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1721				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1722				&tp);
1723	} else {
1724		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1725	}
1726	if (error) {
1727		if (error == -ENOSPC) {
1728			xfs_warn_ratelimited(mp,
1729			"Failed to remove inode(s) from unlinked list. "
1730			"Please free space, unmount and run xfs_repair.");
1731		} else {
1732			ASSERT(XFS_FORCED_SHUTDOWN(mp));
1733		}
1734		return error;
1735	}
1736
1737	/*
1738	 * We do not hold the inode locked across the entire rolling transaction
1739	 * here. We only need to hold it for the first transaction that
1740	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1741	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1742	 * here breaks the relationship between cluster buffer invalidation and
1743	 * stale inode invalidation on cluster buffer item journal commit
1744	 * completion, and can result in leaving dirty stale inodes hanging
1745	 * around in memory.
1746	 *
1747	 * We have no need for serialising this inode operation against other
1748	 * operations - we freed the inode and hence reallocation is required
1749	 * and that will serialise on reallocating the space the deferops need
1750	 * to free. Hence we can unlock the inode on the first commit of
1751	 * the transaction rather than roll it right through the deferops. This
1752	 * avoids relogging the XFS_ISTALE inode.
1753	 *
1754	 * We check that xfs_ifree() hasn't grown an internal transaction roll
1755	 * by asserting that the inode is still locked when it returns.
1756	 */
1757	xfs_ilock(ip, XFS_ILOCK_EXCL);
1758	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1759
1760	error = xfs_ifree(tp, ip);
1761	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1762	if (error) {
1763		/*
1764		 * If we fail to free the inode, shut down.  The cancel
1765		 * might do that, we need to make sure.  Otherwise the
1766		 * inode might be lost for a long time or forever.
1767		 */
1768		if (!XFS_FORCED_SHUTDOWN(mp)) {
1769			xfs_notice(mp, "%s: xfs_ifree returned error %d",
1770				__func__, error);
1771			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1772		}
1773		xfs_trans_cancel(tp);
1774		return error;
1775	}
1776
1777	/*
1778	 * Credit the quota account(s). The inode is gone.
1779	 */
1780	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1781
1782	/*
1783	 * Just ignore errors at this point.  There is nothing we can do except
1784	 * to try to keep going. Make sure it's not a silent error.
1785	 */
1786	error = xfs_trans_commit(tp);
1787	if (error)
1788		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1789			__func__, error);
1790
1791	return 0;
1792}
1793
1794/*
1795 * xfs_inactive
1796 *
1797 * This is called when the reference count for the vnode
1798 * goes to zero.  If the file has been unlinked, then it must
1799 * now be truncated.  Also, we clear all of the read-ahead state
1800 * kept for the inode here since the file is now closed.
1801 */
1802void
1803xfs_inactive(
1804	xfs_inode_t	*ip)
1805{
1806	struct xfs_mount	*mp;
1807	int			error;
1808	int			truncate = 0;
1809
1810	/*
1811	 * If the inode is already free, then there can be nothing
1812	 * to clean up here.
1813	 */
1814	if (VFS_I(ip)->i_mode == 0) {
1815		ASSERT(ip->i_df.if_broot_bytes == 0);
1816		return;
1817	}
1818
1819	mp = ip->i_mount;
1820	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1821
1822	/* If this is a read-only mount, don't do this (would generate I/O) */
1823	if (mp->m_flags & XFS_MOUNT_RDONLY)
1824		return;
1825
1826	/* Try to clean out the cow blocks if there are any. */
1827	if (xfs_inode_has_cow_data(ip))
1828		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1829
1830	if (VFS_I(ip)->i_nlink != 0) {
1831		/*
1832		 * force is true because we are evicting an inode from the
1833		 * cache. Post-eof blocks must be freed, lest we end up with
1834		 * broken free space accounting.
1835		 *
1836		 * Note: don't bother with iolock here since lockdep complains
1837		 * about acquiring it in reclaim context. We have the only
1838		 * reference to the inode at this point anyway.
1839		 */
1840		if (xfs_can_free_eofblocks(ip, true))
1841			xfs_free_eofblocks(ip);
1842
1843		return;
1844	}
1845
1846	if (S_ISREG(VFS_I(ip)->i_mode) &&
1847	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1848	     ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1849		truncate = 1;
1850
1851	error = xfs_qm_dqattach(ip);
1852	if (error)
1853		return;
1854
1855	if (S_ISLNK(VFS_I(ip)->i_mode))
1856		error = xfs_inactive_symlink(ip);
1857	else if (truncate)
1858		error = xfs_inactive_truncate(ip);
1859	if (error)
1860		return;
1861
1862	/*
1863	 * If there are attributes associated with the file then blow them away
1864	 * now.  The code calls a routine that recursively deconstructs the
1865	 * attribute fork. It also blows away the in-core attribute fork.
1866	 */
1867	if (XFS_IFORK_Q(ip)) {
1868		error = xfs_attr_inactive(ip);
1869		if (error)
1870			return;
1871	}
1872
1873	ASSERT(!ip->i_afp);
1874	ASSERT(ip->i_d.di_forkoff == 0);
1875
1876	/*
1877	 * Free the inode.
1878	 */
1879	error = xfs_inactive_ifree(ip);
1880	if (error)
1881		return;
1882
1883	/*
1884	 * Release the dquots held by inode, if any.
1885	 */
1886	xfs_qm_dqdetach(ip);
1887}
1888
1889/*
1890 * In-Core Unlinked List Lookups
1891 * =============================
1892 *
1893 * Every inode is supposed to be reachable from some other piece of metadata
1894 * with the exception of the root directory.  Inodes with a connection to a
1895 * file descriptor but not linked from anywhere in the on-disk directory tree
1896 * are collectively known as unlinked inodes, though the filesystem itself
1897 * maintains links to these inodes so that on-disk metadata are consistent.
1898 *
1899 * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
1900 * header contains a number of buckets that point to an inode, and each inode
1901 * record has a pointer to the next inode in the hash chain.  This
1902 * singly-linked list causes scaling problems in the iunlink remove function
1903 * because we must walk that list to find the inode that points to the inode
1904 * being removed from the unlinked hash bucket list.
1905 *
1906 * What if we modelled the unlinked list as a collection of records capturing
1907 * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
1908 * have a fast way to look up unlinked list predecessors, which avoids the
1909 * slow list walk.  That's exactly what we do here (in-core) with a per-AG
1910 * rhashtable.
1911 *
1912 * Because this is a backref cache, we ignore operational failures since the
1913 * iunlink code can fall back to the slow bucket walk.  The only errors that
1914 * should bubble out are for obviously incorrect situations.
1915 *
1916 * All users of the backref cache MUST hold the AGI buffer lock to serialize
1917 * access or have otherwise provided for concurrency control.
1918 */
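
/*
 * Illustrative sketch (values made up purely for this comment): suppose
 * bucket 0 of an AGI currently chains agino 103 -> 52 -> 67 -> NULLAGINO.
 * The backref cache would then hold the records
 *
 *	{ iu_agino = 103, iu_next_unlinked = 52 }
 *	{ iu_agino = 52,  iu_next_unlinked = 67 }
 *
 * keyed on iu_next_unlinked, so finding the predecessor of agino 67 is a
 * single rhashtable lookup on the key 67 instead of a walk from the bucket
 * head.
 */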
1919
1920/* Capture a "X.next_unlinked = Y" relationship. */
1921struct xfs_iunlink {
1922	struct rhash_head	iu_rhash_head;
1923	xfs_agino_t		iu_agino;		/* X */
1924	xfs_agino_t		iu_next_unlinked;	/* Y */
1925};
1926
1927/* Unlinked list predecessor lookup hashtable construction */
1928static int
1929xfs_iunlink_obj_cmpfn(
1930	struct rhashtable_compare_arg	*arg,
1931	const void			*obj)
1932{
1933	const xfs_agino_t		*key = arg->key;
1934	const struct xfs_iunlink	*iu = obj;
1935
1936	if (iu->iu_next_unlinked != *key)
1937		return 1;
1938	return 0;
1939}
1940
1941static const struct rhashtable_params xfs_iunlink_hash_params = {
1942	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
1943	.key_len		= sizeof(xfs_agino_t),
1944	.key_offset		= offsetof(struct xfs_iunlink,
1945					   iu_next_unlinked),
1946	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
1947	.automatic_shrinking	= true,
1948	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
1949};
1950
1951/*
1952 * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
1953 * relation is found.
1954 */
1955static xfs_agino_t
1956xfs_iunlink_lookup_backref(
1957	struct xfs_perag	*pag,
1958	xfs_agino_t		agino)
1959{
1960	struct xfs_iunlink	*iu;
1961
1962	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1963			xfs_iunlink_hash_params);
1964	return iu ? iu->iu_agino : NULLAGINO;
1965}
1966
1967/*
1968 * Take ownership of an iunlink cache entry and insert it into the hash table.
1969 * If successful, the entry will be owned by the cache; if not, it is freed.
1970 * Either way, the caller does not own @iu after this call.
1971 */
1972static int
1973xfs_iunlink_insert_backref(
1974	struct xfs_perag	*pag,
1975	struct xfs_iunlink	*iu)
1976{
1977	int			error;
1978
1979	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
1980			&iu->iu_rhash_head, xfs_iunlink_hash_params);
1981	/*
1982	 * Fail loudly if there already was an entry because that's a sign of
1983	 * corruption of in-memory data.  Also fail loudly if we see an error
1984	 * code we didn't anticipate from the rhashtable code.  Currently we
1985	 * only anticipate ENOMEM.
1986	 */
1987	if (error) {
1988		WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
1989		kmem_free(iu);
1990	}
1991	/*
1992	 * Absorb any runtime errors that aren't a result of corruption because
1993	 * this is a cache and we can always fall back to bucket list scanning.
1994	 */
1995	if (error != 0 && error != -EEXIST)
1996		error = 0;
1997	return error;
1998}
1999
2000/* Remember that @prev_agino.next_unlinked = @this_agino. */
2001static int
2002xfs_iunlink_add_backref(
2003	struct xfs_perag	*pag,
2004	xfs_agino_t		prev_agino,
2005	xfs_agino_t		this_agino)
2006{
2007	struct xfs_iunlink	*iu;
2008
2009	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2010		return 0;
2011
2012	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2013	iu->iu_agino = prev_agino;
2014	iu->iu_next_unlinked = this_agino;
2015
2016	return xfs_iunlink_insert_backref(pag, iu);
2017}
2018
2019/*
2020 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
2021 * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
2022 * wasn't any such entry then we don't bother.
2023 */
2024static int
2025xfs_iunlink_change_backref(
2026	struct xfs_perag	*pag,
2027	xfs_agino_t		agino,
2028	xfs_agino_t		next_unlinked)
2029{
2030	struct xfs_iunlink	*iu;
2031	int			error;
2032
2033	/* Look up the old entry; if there wasn't one then exit. */
2034	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
2035			xfs_iunlink_hash_params);
2036	if (!iu)
2037		return 0;
2038
2039	/*
2040	 * Remove the entry.  This shouldn't ever return an error, but if we
2041	 * couldn't remove the old entry we don't want to add it again to the
2042	 * hash table, and if the entry disappeared on us then someone's
2043	 * violated the locking rules and we need to fail loudly.  Either way
2044	 * we cannot remove the inode because internal state is or would have
2045	 * been corrupt.
2046	 */
2047	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
2048			&iu->iu_rhash_head, xfs_iunlink_hash_params);
2049	if (error)
2050		return error;
2051
2052	/* If there is no new next entry just free our item and return. */
2053	if (next_unlinked == NULLAGINO) {
2054		kmem_free(iu);
2055		return 0;
2056	}
2057
2058	/* Update the entry and re-add it to the hash table. */
2059	iu->iu_next_unlinked = next_unlinked;
2060	return xfs_iunlink_insert_backref(pag, iu);
2061}
2062
2063/* Set up the in-core predecessor structures. */
2064int
2065xfs_iunlink_init(
2066	struct xfs_perag	*pag)
2067{
2068	return rhashtable_init(&pag->pagi_unlinked_hash,
2069			&xfs_iunlink_hash_params);
2070}
2071
2072/* Free the in-core predecessor structures. */
2073static void
2074xfs_iunlink_free_item(
2075	void			*ptr,
2076	void			*arg)
2077{
2078	struct xfs_iunlink	*iu = ptr;
2079	bool			*freed_anything = arg;
2080
2081	*freed_anything = true;
2082	kmem_free(iu);
2083}
2084
2085void
2086xfs_iunlink_destroy(
2087	struct xfs_perag	*pag)
2088{
2089	bool			freed_anything = false;
2090
2091	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2092			xfs_iunlink_free_item, &freed_anything);
2093
2094	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
2095}
2096
2097/*
2098 * Point the AGI unlinked bucket at an inode and log the results.  The caller
2099 * is responsible for validating the old value.
2100 */
2101STATIC int
2102xfs_iunlink_update_bucket(
2103	struct xfs_trans	*tp,
2104	xfs_agnumber_t		agno,
2105	struct xfs_buf		*agibp,
2106	unsigned int		bucket_index,
2107	xfs_agino_t		new_agino)
2108{
2109	struct xfs_agi		*agi = agibp->b_addr;
2110	xfs_agino_t		old_value;
2111	int			offset;
2112
2113	ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
2114
2115	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2116	trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
2117			old_value, new_agino);
2118
2119	/*
2120	 * We should never find the head of the list already set to the value
2121	 * passed in because either we're adding or removing ourselves from the
2122	 * head of the list.
2123	 */
2124	if (old_value == new_agino) {
2125		xfs_buf_mark_corrupt(agibp);
2126		return -EFSCORRUPTED;
2127	}
2128
2129	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2130	offset = offsetof(struct xfs_agi, agi_unlinked) +
2131			(sizeof(xfs_agino_t) * bucket_index);
2132	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2133	return 0;
2134}
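
/*
 * For illustration only: with bucket_index 3, the range logged above covers
 * exactly the four bytes of agi_unlinked[3], i.e. it starts at
 * offsetof(struct xfs_agi, agi_unlinked) + 3 * sizeof(xfs_agino_t) and runs
 * for sizeof(xfs_agino_t) bytes, so only the single bucket pointer is
 * relogged rather than the whole AGI.
 */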
2135
2136/* Set an on-disk inode's next_unlinked pointer. */
2137STATIC void
2138xfs_iunlink_update_dinode(
2139	struct xfs_trans	*tp,
2140	xfs_agnumber_t		agno,
2141	xfs_agino_t		agino,
2142	struct xfs_buf		*ibp,
2143	struct xfs_dinode	*dip,
2144	struct xfs_imap		*imap,
2145	xfs_agino_t		next_agino)
2146{
2147	struct xfs_mount	*mp = tp->t_mountp;
2148	int			offset;
2149
2150	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2151
2152	trace_xfs_iunlink_update_dinode(mp, agno, agino,
2153			be32_to_cpu(dip->di_next_unlinked), next_agino);
2154
2155	dip->di_next_unlinked = cpu_to_be32(next_agino);
2156	offset = imap->im_boffset +
2157			offsetof(struct xfs_dinode, di_next_unlinked);
2158
2159	/* need to recalc the inode CRC if appropriate */
2160	xfs_dinode_calc_crc(mp, dip);
2161	xfs_trans_inode_buf(tp, ibp);
2162	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2163}
2164
2165/* Set an in-core inode's unlinked pointer and return the old value. */
2166STATIC int
2167xfs_iunlink_update_inode(
2168	struct xfs_trans	*tp,
2169	struct xfs_inode	*ip,
2170	xfs_agnumber_t		agno,
2171	xfs_agino_t		next_agino,
2172	xfs_agino_t		*old_next_agino)
2173{
2174	struct xfs_mount	*mp = tp->t_mountp;
2175	struct xfs_dinode	*dip;
2176	struct xfs_buf		*ibp;
2177	xfs_agino_t		old_value;
2178	int			error;
2179
2180	ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2181
2182	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0);
2183	if (error)
2184		return error;
2185
2186	/* Make sure the old pointer isn't garbage. */
2187	old_value = be32_to_cpu(dip->di_next_unlinked);
2188	if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
2189		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
2190				sizeof(*dip), __this_address);
2191		error = -EFSCORRUPTED;
2192		goto out;
2193	}
2194
2195	/*
2196	 * Since we're updating a linked list, we should never find that the
2197	 * current pointer is the same as the new value, unless we're
2198	 * terminating the list.
2199	 */
2200	*old_next_agino = old_value;
2201	if (old_value == next_agino) {
2202		if (next_agino != NULLAGINO) {
2203			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
2204					dip, sizeof(*dip), __this_address);
2205			error = -EFSCORRUPTED;
2206		}
2207		goto out;
2208	}
2209
2210	/* Ok, update the new pointer. */
2211	xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
2212			ibp, dip, &ip->i_imap, next_agino);
2213	return 0;
2214out:
2215	xfs_trans_brelse(tp, ibp);
2216	return error;
2217}
2218
2219/*
2220 * This is called when the inode's link count has gone to 0 or we are creating
2221 * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
2222 *
2223 * We place the on-disk inode on a list in the AGI.  It will be pulled from this
2224 * list when the inode is freed.
2225 */
2226STATIC int
2227xfs_iunlink(
2228	struct xfs_trans	*tp,
2229	struct xfs_inode	*ip)
2230{
2231	struct xfs_mount	*mp = tp->t_mountp;
2232	struct xfs_agi		*agi;
2233	struct xfs_buf		*agibp;
2234	xfs_agino_t		next_agino;
2235	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2236	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2237	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2238	int			error;
2239
2240	ASSERT(VFS_I(ip)->i_nlink == 0);
2241	ASSERT(VFS_I(ip)->i_mode != 0);
2242	trace_xfs_iunlink(ip);
2243
2244	/* Get the agi buffer first.  It ensures lock ordering on the list. */
2245	error = xfs_read_agi(mp, tp, agno, &agibp);
2246	if (error)
2247		return error;
2248	agi = agibp->b_addr;
2249
2250	/*
2251	 * Get the index into the agi hash table for the list this inode will
2252	 * go on.  Make sure the pointer isn't garbage and that this inode
2253	 * isn't already on the list.
2254	 */
2255	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2256	if (next_agino == agino ||
2257	    !xfs_verify_agino_or_null(mp, agno, next_agino)) {
2258		xfs_buf_mark_corrupt(agibp);
2259		return -EFSCORRUPTED;
2260	}
2261
2262	if (next_agino != NULLAGINO) {
2263		xfs_agino_t		old_agino;
2264
2265		/*
2266		 * There is already another inode in the bucket, so point this
2267		 * inode to the current head of the list.
2268		 */
2269		error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2270				&old_agino);
2271		if (error)
2272			return error;
2273		ASSERT(old_agino == NULLAGINO);
2274
2275		/*
2276		 * agino has been unlinked, add a backref from the next inode
2277		 * back to agino.
2278		 */
2279		error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
2280		if (error)
2281			return error;
2282	}
2283
2284	/* Point the head of the list to point to this inode. */
2285	return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2286}
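
/*
 * A hedged example of the insert above, using made-up inode numbers and
 * assuming the usual 64-bucket AGI: unlinking agino 67 (67 % 64 == bucket 3)
 * while agi_unlinked[3] currently points at agino 131 leaves
 *
 *	agi_unlinked[3] = 67
 *	inode 67: di_next_unlinked = 131
 *
 * and adds a backref recording that inode 67 precedes inode 131, so inodes
 * are always pushed onto the head of the bucket list.
 */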
2287
2288/* Return the imap, dinode pointer, and buffer for an inode. */
2289STATIC int
2290xfs_iunlink_map_ino(
2291	struct xfs_trans	*tp,
2292	xfs_agnumber_t		agno,
2293	xfs_agino_t		agino,
2294	struct xfs_imap		*imap,
2295	struct xfs_dinode	**dipp,
2296	struct xfs_buf		**bpp)
2297{
2298	struct xfs_mount	*mp = tp->t_mountp;
2299	int			error;
2300
2301	imap->im_blkno = 0;
2302	error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2303	if (error) {
2304		xfs_warn(mp, "%s: xfs_imap returned error %d.",
2305				__func__, error);
2306		return error;
2307	}
2308
2309	error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0);
2310	if (error) {
2311		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2312				__func__, error);
2313		return error;
2314	}
2315
2316	return 0;
2317}
2318
2319/*
2320 * Walk the unlinked chain from @head_agino until we find the inode that
2321 * points to @target_agino.  Return the inode number, map, dinode pointer,
2322 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2323 *
2324 * @tp, @pag, @head_agino, and @target_agino are input parameters.
2325 * @agino, @imap, @dipp, and @bpp are all output parameters.
2326 *
2327 * Do not call this function if @target_agino is the head of the list.
2328 */
2329STATIC int
2330xfs_iunlink_map_prev(
2331	struct xfs_trans	*tp,
2332	xfs_agnumber_t		agno,
2333	xfs_agino_t		head_agino,
2334	xfs_agino_t		target_agino,
2335	xfs_agino_t		*agino,
2336	struct xfs_imap		*imap,
2337	struct xfs_dinode	**dipp,
2338	struct xfs_buf		**bpp,
2339	struct xfs_perag	*pag)
2340{
2341	struct xfs_mount	*mp = tp->t_mountp;
2342	xfs_agino_t		next_agino;
2343	int			error;
2344
2345	ASSERT(head_agino != target_agino);
2346	*bpp = NULL;
2347
2348	/* See if our backref cache can find it faster. */
2349	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
2350	if (*agino != NULLAGINO) {
2351		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
2352		if (error)
2353			return error;
2354
2355		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2356			return 0;
2357
2358		/*
2359		 * If we get here the cache contents were corrupt, so drop the
2360		 * buffer and fall back to walking the bucket list.
2361		 */
2362		xfs_trans_brelse(tp, *bpp);
2363		*bpp = NULL;
2364		WARN_ON_ONCE(1);
2365	}
2366
2367	trace_xfs_iunlink_map_prev_fallback(mp, agno);
2368
2369	/* Otherwise, walk the entire bucket until we find it. */
2370	next_agino = head_agino;
2371	while (next_agino != target_agino) {
2372		xfs_agino_t	unlinked_agino;
2373
2374		if (*bpp)
2375			xfs_trans_brelse(tp, *bpp);
2376
2377		*agino = next_agino;
2378		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
2379				bpp);
2380		if (error)
2381			return error;
2382
2383		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2384		/*
2385		 * Make sure this pointer is valid and isn't an obvious
2386		 * infinite loop.
2387		 */
2388		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
2389		    next_agino == unlinked_agino) {
2390			XFS_CORRUPTION_ERROR(__func__,
2391					XFS_ERRLEVEL_LOW, mp,
2392					*dipp, sizeof(**dipp));
2393			error = -EFSCORRUPTED;
2394			return error;
2395		}
2396		next_agino = unlinked_agino;
2397	}
2398
2399	return 0;
2400}
2401
2402/*
2403 * Pull the on-disk inode from the AGI unlinked list.
2404 */
2405STATIC int
2406xfs_iunlink_remove(
2407	struct xfs_trans	*tp,
2408	struct xfs_inode	*ip)
2409{
2410	struct xfs_mount	*mp = tp->t_mountp;
2411	struct xfs_agi		*agi;
2412	struct xfs_buf		*agibp;
2413	struct xfs_buf		*last_ibp;
2414	struct xfs_dinode	*last_dip = NULL;
2415	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2416	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2417	xfs_agino_t		next_agino;
2418	xfs_agino_t		head_agino;
2419	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2420	int			error;
2421
2422	trace_xfs_iunlink_remove(ip);
2423
2424	/* Get the agi buffer first.  It ensures lock ordering on the list. */
2425	error = xfs_read_agi(mp, tp, agno, &agibp);
2426	if (error)
2427		return error;
2428	agi = agibp->b_addr;
2429
2430	/*
2431	 * Get the index into the agi hash table for the list this inode is
2432	 * on.  Make sure the head pointer isn't garbage.
2433	 */
2434	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2435	if (!xfs_verify_agino(mp, agno, head_agino)) {
2436		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2437				agi, sizeof(*agi));
2438		return -EFSCORRUPTED;
2439	}
2440
2441	/*
2442	 * Set our inode's next_unlinked pointer to NULL and then return
2443	 * the old pointer value so that we can update whatever was previous
2444	 * to us in the list to point to whatever was next in the list.
2445	 */
2446	error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
2447	if (error)
2448		return error;
2449
2450	/*
2451	 * If there was a backref pointing from the next inode back to this
2452	 * one, remove it because we've removed this inode from the list.
2453	 *
2454	 * Later, if this inode was in the middle of the list we'll update
2455	 * this inode's backref to point from the next inode.
2456	 */
2457	if (next_agino != NULLAGINO) {
2458		error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
2459				NULLAGINO);
2460		if (error)
2461			return error;
2462	}
2463
2464	if (head_agino != agino) {
2465		struct xfs_imap	imap;
2466		xfs_agino_t	prev_agino;
2467
2468		/* We need to search the list for the inode being freed. */
2469		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2470				&prev_agino, &imap, &last_dip, &last_ibp,
2471				agibp->b_pag);
2472		if (error)
2473			return error;
2474
2475		/* Point the previous inode on the list to the next inode. */
2476		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2477				last_dip, &imap, next_agino);
2478
2479		/*
2480		 * Now we deal with the backref for this inode.  If this inode
2481		 * pointed at a real inode, change the backref that pointed to
2482		 * us to point to our old next.  If this inode was the end of
2483		 * the list, delete the backref that pointed to us.  Note that
2484		 * change_backref takes care of deleting the backref if
2485		 * next_agino is NULLAGINO.
2486		 */
2487		return xfs_iunlink_change_backref(agibp->b_pag, agino,
2488				next_agino);
2489	}
2490
2491	/* Point the head of the list to the next unlinked inode. */
2492	return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2493			next_agino);
2494}
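
/*
 * Continuing the made-up example following xfs_iunlink(): with bucket 3
 * chaining 67 -> 131 -> NULLAGINO, removing inode 131 takes the
 * head_agino != agino path: the backref cache (or, failing that, a bucket
 * walk) identifies 67 as the predecessor, inode 67's di_next_unlinked is
 * rewritten to NULLAGINO, and the bucket head stays untouched.  Removing
 * inode 67 instead takes the final path and simply repoints agi_unlinked[3]
 * at 131.
 */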
2495
2496/*
2497 * Look up the specified inode number and, if it is not already marked XFS_ISTALE,
2498 * mark it stale. We should only find clean inodes in this lookup that aren't
2499 * already stale.
2500 */
2501static void
2502xfs_ifree_mark_inode_stale(
2503	struct xfs_buf		*bp,
2504	struct xfs_inode	*free_ip,
2505	xfs_ino_t		inum)
2506{
2507	struct xfs_mount	*mp = bp->b_mount;
2508	struct xfs_perag	*pag = bp->b_pag;
2509	struct xfs_inode_log_item *iip;
2510	struct xfs_inode	*ip;
2511
2512retry:
2513	rcu_read_lock();
2514	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2515
2516	/* Inode not in memory, nothing to do */
2517	if (!ip) {
2518		rcu_read_unlock();
2519		return;
2520	}
2521
2522	/*
2523	 * Because this is an RCU protected lookup, we could find a recently
2524	 * freed or even reallocated inode during the lookup. We need to check
2525	 * under the i_flags_lock for a valid inode here. Skip it if it is not
2526	 * valid, is the wrong inode, or is stale.
2527	 */
2528	spin_lock(&ip->i_flags_lock);
2529	if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2530		goto out_iflags_unlock;
2531
2532	/*
2533	 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2534	 * other inodes that we did not find attached to the buffer's list and
2535	 * that are not already marked stale. If we can't lock one, back off and
2536	 * retry.
2537	 */
2538	if (ip != free_ip) {
2539		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2540			spin_unlock(&ip->i_flags_lock);
2541			rcu_read_unlock();
2542			delay(1);
2543			goto retry;
2544		}
2545	}
2546	ip->i_flags |= XFS_ISTALE;
2547
2548	/*
2549	 * If the inode is flushing, it is already attached to the buffer.  All
2550	 * we need to do here is mark the inode stale so buffer IO completion
2551	 * will remove it from the AIL.
2552	 */
2553	iip = ip->i_itemp;
2554	if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2555		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2556		ASSERT(iip->ili_last_fields);
2557		goto out_iunlock;
2558	}
2559
2560	/*
2561	 * Inodes not attached to the buffer can be released immediately.
2562	 * Everything else has to go through xfs_iflush_abort() on journal
2563	 * commit as the flock synchronises removal of the inode from the
2564	 * cluster buffer against inode reclaim.
2565	 */
2566	if (!iip || list_empty(&iip->ili_item.li_bio_list))
2567		goto out_iunlock;
2568
2569	__xfs_iflags_set(ip, XFS_IFLUSHING);
2570	spin_unlock(&ip->i_flags_lock);
2571	rcu_read_unlock();
2572
2573	/* we have a dirty inode in memory that has not yet been flushed. */
2574	spin_lock(&iip->ili_lock);
2575	iip->ili_last_fields = iip->ili_fields;
2576	iip->ili_fields = 0;
2577	iip->ili_fsync_fields = 0;
2578	spin_unlock(&iip->ili_lock);
2579	ASSERT(iip->ili_last_fields);
2580
2581	if (ip != free_ip)
2582		xfs_iunlock(ip, XFS_ILOCK_EXCL);
2583	return;
2584
2585out_iunlock:
2586	if (ip != free_ip)
2587		xfs_iunlock(ip, XFS_ILOCK_EXCL);
2588out_iflags_unlock:
2589	spin_unlock(&ip->i_flags_lock);
2590	rcu_read_unlock();
2591}
2592
2593/*
2594 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2595 * inodes that are in memory - they all must be marked stale and attached to
2596 * the cluster buffer.
2597 */
2598STATIC int
2599xfs_ifree_cluster(
2600	struct xfs_inode	*free_ip,
2601	struct xfs_trans	*tp,
2602	struct xfs_icluster	*xic)
2603{
2604	struct xfs_mount	*mp = free_ip->i_mount;
2605	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
2606	struct xfs_buf		*bp;
2607	xfs_daddr_t		blkno;
2608	xfs_ino_t		inum = xic->first_ino;
2609	int			nbufs;
2610	int			i, j;
2611	int			ioffset;
2612	int			error;
2613
2614	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2615
2616	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2617		/*
2618		 * The allocation bitmap tells us which inodes of the chunk were
2619		 * physically allocated. Skip the cluster if an inode falls into
2620		 * a sparse region.
2621		 */
2622		ioffset = inum - xic->first_ino;
2623		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2624			ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2625			continue;
2626		}
2627
2628		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2629					 XFS_INO_TO_AGBNO(mp, inum));
2630
2631		/*
2632		 * We obtain and lock the backing buffer first in the process
2633		 * here to ensure dirty inodes attached to the buffer remain in
2634		 * the flushing state while we mark them stale.
2635		 *
2636		 * If we scan the in-memory inodes first, then buffer IO can
2637		 * complete before we get a lock on it, and hence we may fail
2638		 * to mark all the active inodes on the buffer stale.
2639		 */
2640		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2641				mp->m_bsize * igeo->blocks_per_cluster,
2642				XBF_UNMAPPED, &bp);
2643		if (error)
2644			return error;
2645
2646		/*
2647		 * This buffer may not have been correctly initialised as we
2648		 * didn't read it from disk. That's not important because we are
2649		 * only using it to mark the buffer as stale in the log, and to
2650		 * attach stale cached inodes on it. That means it will never be
2651		 * dispatched for IO. If it is, we want to know about it, and we
2652		 * want it to fail. We can achieve this by adding a write
2653		 * verifier to the buffer.
2654		 */
2655		bp->b_ops = &xfs_inode_buf_ops;
2656
2657		/*
2658		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
2659		 * too. This requires lookups, and will skip inodes that we've
2660		 * already marked XFS_ISTALE.
2661		 */
2662		for (i = 0; i < igeo->inodes_per_cluster; i++)
2663			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
2664
2665		xfs_trans_stale_inode_buf(tp, bp);
2666		xfs_trans_binval(tp, bp);
2667	}
2668	return 0;
2669}
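
/*
 * Rough worked example of the geometry above; the real numbers depend on
 * the filesystem.  With 4k blocks, 512 byte inodes and 64-inode chunks,
 * igeo->ialloc_blks would be 8, igeo->blocks_per_cluster might be 4 and
 * igeo->inodes_per_cluster 32, so the loop walks nbufs = 8 / 4 = 2 cluster
 * buffers, each covering 32 consecutive inodes of the freed chunk.
 */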
2670
2671/*
2672 * This is called to return an inode to the inode free list.  The inode should
2673 * already be truncated to 0 length and have no pages associated with it.  This
2674 * routine also assumes that the inode is already a part of the transaction.
2675 *
2676 * The on-disk copy of the inode will have been added to the list of unlinked
2677 * inodes in the AGI. We need to remove the inode from that list atomically with
2678 * respect to freeing it here.
2679 */
2680int
2681xfs_ifree(
2682	struct xfs_trans	*tp,
2683	struct xfs_inode	*ip)
2684{
2685	int			error;
2686	struct xfs_icluster	xic = { 0 };
2687	struct xfs_inode_log_item *iip = ip->i_itemp;
2688
2689	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2690	ASSERT(VFS_I(ip)->i_nlink == 0);
2691	ASSERT(ip->i_df.if_nextents == 0);
2692	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2693	ASSERT(ip->i_d.di_nblocks == 0);
2694
2695	/*
2696	 * Free the inode first so that we guarantee that the AGI lock is going
2697	 * to be taken before we remove the inode from the unlinked list. This
2698	 * makes the AGI lock -> unlinked list modification order the same as
2699	 * used in O_TMPFILE creation.
2700	 */
2701	error = xfs_difree(tp, ip->i_ino, &xic);
2702	if (error)
2703		return error;
2704
2705	error = xfs_iunlink_remove(tp, ip);
2706	if (error)
2707		return error;
2708
2709	/*
2710	 * Free any local-format data sitting around before we reset the
2711	 * data fork to extents format.  Note that the attr fork data has
2712	 * already been freed by xfs_attr_inactive.
2713	 */
2714	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2715		kmem_free(ip->i_df.if_u1.if_data);
2716		ip->i_df.if_u1.if_data = NULL;
2717		ip->i_df.if_bytes = 0;
2718	}
2719
2720	VFS_I(ip)->i_mode = 0;		/* mark incore inode as free */
2721	ip->i_d.di_flags = 0;
2722	ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
2723	ip->i_d.di_dmevmask = 0;
2724	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
2725	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2726
2727	/* Don't attempt to replay owner changes for a deleted inode */
2728	spin_lock(&iip->ili_lock);
2729	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2730	spin_unlock(&iip->ili_lock);
2731
2732	/*
2733	 * Bump the generation count so no one will be confused
2734	 * by reincarnations of this inode.
2735	 */
2736	VFS_I(ip)->i_generation++;
2737	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2738
2739	if (xic.deleted)
2740		error = xfs_ifree_cluster(ip, tp, &xic);
2741
2742	return error;
2743}
2744
2745/*
2746 * This is called to unpin an inode.  The caller must have the inode locked
2747 * in at least shared mode so that the buffer cannot be subsequently pinned
2748 * once someone is waiting for it to be unpinned.
2749 */
2750static void
2751xfs_iunpin(
2752	struct xfs_inode	*ip)
2753{
2754	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2755
2756	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2757
2758	/* Give the log a push to start the unpinning I/O */
2759	xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2760
2761}
2762
2763static void
2764__xfs_iunpin_wait(
2765	struct xfs_inode	*ip)
2766{
2767	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2768	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2769
2770	xfs_iunpin(ip);
2771
2772	do {
2773		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2774		if (xfs_ipincount(ip))
2775			io_schedule();
2776	} while (xfs_ipincount(ip));
2777	finish_wait(wq, &wait.wq_entry);
2778}
2779
2780void
2781xfs_iunpin_wait(
2782	struct xfs_inode	*ip)
2783{
2784	if (xfs_ipincount(ip))
2785		__xfs_iunpin_wait(ip);
2786}
2787
2788/*
2789 * Removing an inode from the namespace involves removing the directory entry
2790 * and dropping the link count on the inode. Removing the directory entry can
2791 * result in locking an AGF (directory blocks were freed) and removing a link
2792 * count can result in placing the inode on an unlinked list which results in
2793 * locking an AGI.
2794 *
2795 * The big problem here is that we have an ordering constraint on AGF and AGI
2796 * locking - inode allocation locks the AGI, then can allocate a new extent for
2797 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2798 * removes the inode from the unlinked list, requiring that we lock the AGI
2799 * first, and then freeing the inode can result in an inode chunk being freed
2800 * and hence freeing disk space requiring that we lock an AGF.
2801 *
2802 * Hence the ordering that is imposed by other parts of the code is AGI before
2803 * AGF. This means we cannot remove the directory entry before we drop the inode
2804 * reference count and put it on the unlinked list as this results in a lock
2805 * order of AGF then AGI, and this can deadlock against inode allocation and
2806 * freeing. Therefore we must drop the link counts before we remove the
2807 * directory entry.
2808 *
2809 * This is still safe from a transactional point of view - it is not until we
2810 * get to xfs_defer_finish() that we have the possibility of multiple
2811 * transactions in this operation. Hence as long as we remove the directory
2812 * entry and drop the link count in the first transaction of the remove
2813 * operation, there are no transactional constraints on the ordering here.
2814 */
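
/*
 * A sketch of the deadlock the above ordering avoids (AG numbers and the
 * second thread are hypothetical): if we removed the directory entry first,
 * this path could lock AGF 2 (freeing a directory block) and then want
 * AGI 2 (unlinked list insert), while a concurrent inode allocation in AG 2
 * holds AGI 2 and waits for AGF 2 to allocate a new inode chunk, which is a
 * classic AB-BA deadlock.  Dropping the link count first keeps every path
 * in AGI-before-AGF order.
 */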
2815int
2816xfs_remove(
2817	xfs_inode_t             *dp,
2818	struct xfs_name		*name,
2819	xfs_inode_t		*ip)
2820{
2821	xfs_mount_t		*mp = dp->i_mount;
2822	xfs_trans_t             *tp = NULL;
2823	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2824	int                     error = 0;
2825	uint			resblks;
2826
2827	trace_xfs_remove(dp, name);
2828
2829	if (XFS_FORCED_SHUTDOWN(mp))
2830		return -EIO;
2831
2832	error = xfs_qm_dqattach(dp);
2833	if (error)
2834		goto std_return;
2835
2836	error = xfs_qm_dqattach(ip);
2837	if (error)
2838		goto std_return;
2839
2840	/*
2841	 * We try to get the real space reservation first,
2842	 * allowing for directory btree deletion(s) implying
2843	 * possible bmap insert(s).  If we can't get the space
2844	 * reservation then we use 0 instead, and avoid the bmap
2845	 * btree insert(s) in the directory code: if a bmap insert
2846	 * would otherwise be needed, the directory code instead trims
2847	 * the LAST block from the directory.
2848	 */
2849	resblks = XFS_REMOVE_SPACE_RES(mp);
2850	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
2851	if (error == -ENOSPC) {
2852		resblks = 0;
2853		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
2854				&tp);
2855	}
2856	if (error) {
2857		ASSERT(error != -ENOSPC);
2858		goto std_return;
2859	}
2860
2861	xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
2862
2863	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2864	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2865
2866	/*
2867	 * If we're removing a directory perform some additional validation.
2868	 */
2869	if (is_dir) {
2870		ASSERT(VFS_I(ip)->i_nlink >= 2);
2871		if (VFS_I(ip)->i_nlink != 2) {
2872			error = -ENOTEMPTY;
2873			goto out_trans_cancel;
2874		}
2875		if (!xfs_dir_isempty(ip)) {
2876			error = -ENOTEMPTY;
2877			goto out_trans_cancel;
2878		}
2879
2880		/* Drop the link from ip's "..".  */
2881		error = xfs_droplink(tp, dp);
2882		if (error)
2883			goto out_trans_cancel;
2884
2885		/* Drop the "." link from ip to self.  */
2886		error = xfs_droplink(tp, ip);
2887		if (error)
2888			goto out_trans_cancel;
2889	} else {
2890		/*
2891		 * When removing a non-directory we need to log the parent
2892		 * inode here.  For a directory this is done implicitly
2893		 * by the xfs_droplink call for the ".." entry.
2894		 */
2895		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2896	}
2897	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2898
2899	/* Drop the link from dp to ip. */
2900	error = xfs_droplink(tp, ip);
2901	if (error)
2902		goto out_trans_cancel;
2903
2904	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2905	if (error) {
2906		ASSERT(error != -ENOENT);
2907		goto out_trans_cancel;
2908	}
2909
2910	/*
2911	 * If this is a synchronous mount, make sure that the
2912	 * remove transaction goes to disk before returning to
2913	 * the user.
2914	 */
2915	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2916		xfs_trans_set_sync(tp);
2917
2918	error = xfs_trans_commit(tp);
2919	if (error)
2920		goto std_return;
2921
2922	if (is_dir && xfs_inode_is_filestream(ip))
2923		xfs_filestream_deassociate(ip);
2924
2925	return 0;
2926
2927 out_trans_cancel:
2928	xfs_trans_cancel(tp);
2929 std_return:
2930	return error;
2931}
2932
2933/*
2934 * Enter all inodes for a rename transaction into a sorted array.
2935 */
2936#define __XFS_SORT_INODES	5
2937STATIC void
2938xfs_sort_for_rename(
2939	struct xfs_inode	*dp1,	/* in: old (source) directory inode */
2940	struct xfs_inode	*dp2,	/* in: new (target) directory inode */
2941	struct xfs_inode	*ip1,	/* in: inode of old entry */
2942	struct xfs_inode	*ip2,	/* in: inode of new entry */
2943	struct xfs_inode	*wip,	/* in: whiteout inode */
2944	struct xfs_inode	**i_tab,/* out: sorted array of inodes */
2945	int			*num_inodes)  /* in/out: inodes in array */
2946{
2947	int			i, j;
2948
2949	ASSERT(*num_inodes == __XFS_SORT_INODES);
2950	memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2951
2952	/*
2953	 * i_tab contains a list of pointers to inodes.  We initialize
2954	 * the table here and we'll sort it.  We will then use it to
2955	 * order the acquisition of the inode locks.
2956	 *
2957	 * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2958	 */
2959	i = 0;
2960	i_tab[i++] = dp1;
2961	i_tab[i++] = dp2;
2962	i_tab[i++] = ip1;
2963	if (ip2)
2964		i_tab[i++] = ip2;
2965	if (wip)
2966		i_tab[i++] = wip;
2967	*num_inodes = i;
2968
2969	/*
2970	 * Sort the elements via bubble sort.  (Remember, there are at
2971	 * most 5 elements to sort, so this is adequate.)
2972	 */
2973	for (i = 0; i < *num_inodes; i++) {
2974		for (j = 1; j < *num_inodes; j++) {
2975			if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2976				struct xfs_inode *temp = i_tab[j];
2977				i_tab[j] = i_tab[j-1];
2978				i_tab[j-1] = temp;
2979			}
2980		}
2981	}
2982}
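
/*
 * Illustrative note on why the sort matters (inode numbers made up): if a
 * rename involves inodes 900 (src dir), 40 (target dir) and 512 (victim),
 * every thread renaming among those inodes ends up taking the ILOCKs as
 * 40, then 512, then 900.  Locking in ascending inode number order via
 * xfs_lock_inodes() is what lets concurrent renames avoid AB-BA deadlocks.
 */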
2983
2984static int
2985xfs_finish_rename(
2986	struct xfs_trans	*tp)
2987{
2988	/*
2989	 * If this is a synchronous mount, make sure that the rename transaction
2990	 * goes to disk before returning to the user.
2991	 */
2992	if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2993		xfs_trans_set_sync(tp);
2994
2995	return xfs_trans_commit(tp);
2996}
2997
2998/*
2999 * xfs_cross_rename()
3000 *
3001 * responsible for handling the RENAME_EXCHANGE flag in the renameat2() syscall
3002 */
3003STATIC int
3004xfs_cross_rename(
3005	struct xfs_trans	*tp,
3006	struct xfs_inode	*dp1,
3007	struct xfs_name		*name1,
3008	struct xfs_inode	*ip1,
3009	struct xfs_inode	*dp2,
3010	struct xfs_name		*name2,
3011	struct xfs_inode	*ip2,
3012	int			spaceres)
3013{
3014	int		error = 0;
3015	int		ip1_flags = 0;
3016	int		ip2_flags = 0;
3017	int		dp2_flags = 0;
3018
3019	/* Swap inode number for dirent in first parent */
3020	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
3021	if (error)
3022		goto out_trans_abort;
3023
3024	/* Swap inode number for dirent in second parent */
3025	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
3026	if (error)
3027		goto out_trans_abort;
3028
3029	/*
3030	 * If we're renaming one or more directories across different parents,
3031	 * update the respective ".." entries (and link counts) to match the new
3032	 * parents.
3033	 */
3034	if (dp1 != dp2) {
3035		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3036
3037		if (S_ISDIR(VFS_I(ip2)->i_mode)) {
3038			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
3039						dp1->i_ino, spaceres);
3040			if (error)
3041				goto out_trans_abort;
3042
3043			/* transfer ip2 ".." reference to dp1 */
3044			if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
3045				error = xfs_droplink(tp, dp2);
3046				if (error)
3047					goto out_trans_abort;
3048				xfs_bumplink(tp, dp1);
3049			}
3050
3051			/*
3052			 * Although ip1 isn't changed here, userspace needs
3053			 * to be warned about the change so that applications
3054			 * relying on it (like backup tools) will be properly
3055			 * notified of the change.
3056			 */
3057			ip1_flags |= XFS_ICHGTIME_CHG;
3058			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3059		}
3060
3061		if (S_ISDIR(VFS_I(ip1)->i_mode)) {
3062			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
3063						dp2->i_ino, spaceres);
3064			if (error)
3065				goto out_trans_abort;
3066
3067			/* transfer ip1 ".." reference to dp2 */
3068			if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
3069				error = xfs_droplink(tp, dp1);
3070				if (error)
3071					goto out_trans_abort;
3072				xfs_bumplink(tp, dp2);
3073			}
3074
3075			/*
3076			 * Although ip2 isn't changed here, userspace needs
3077			 * to be warned about the change so that applications
3078			 * relying on it (like backup tools) will be properly
3079			 * notified of the change.
3080			 */
3081			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3082			ip2_flags |= XFS_ICHGTIME_CHG;
3083		}
3084	}
3085
3086	if (ip1_flags) {
3087		xfs_trans_ichgtime(tp, ip1, ip1_flags);
3088		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
3089	}
3090	if (ip2_flags) {
3091		xfs_trans_ichgtime(tp, ip2, ip2_flags);
3092		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
3093	}
3094	if (dp2_flags) {
3095		xfs_trans_ichgtime(tp, dp2, dp2_flags);
3096		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
3097	}
3098	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3099	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
3100	return xfs_finish_rename(tp);
3101
3102out_trans_abort:
3103	xfs_trans_cancel(tp);
3104	return error;
3105}
3106
3107/*
3108 * xfs_rename_alloc_whiteout()
3109 *
3110 * Return a referenced, unlinked, unlocked inode that can be used as a
3111 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
3112 * crash between allocating the inode and linking it into the rename transaction,
3113 * recovery will free the inode and we won't leak it.
3114 */
3115static int
3116xfs_rename_alloc_whiteout(
3117	struct xfs_inode	*dp,
3118	struct xfs_inode	**wip)
3119{
3120	struct xfs_inode	*tmpfile;
3121	int			error;
3122
3123	error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
3124	if (error)
3125		return error;
3126
3127	/*
3128	 * Prepare the tmpfile inode as if it were created through the VFS.
3129	 * Complete the inode setup and flag it as linkable.  nlink is already
3130	 * zero, so we can skip the drop_nlink.
3131	 */
3132	xfs_setup_iops(tmpfile);
3133	xfs_finish_inode_setup(tmpfile);
3134	VFS_I(tmpfile)->i_state |= I_LINKABLE;
3135
3136	*wip = tmpfile;
3137	return 0;
3138}
3139
3140/*
3141 * xfs_rename
3142 */
3143int
3144xfs_rename(
3145	struct xfs_inode	*src_dp,
3146	struct xfs_name		*src_name,
3147	struct xfs_inode	*src_ip,
3148	struct xfs_inode	*target_dp,
3149	struct xfs_name		*target_name,
3150	struct xfs_inode	*target_ip,
3151	unsigned int		flags)
3152{
3153	struct xfs_mount	*mp = src_dp->i_mount;
3154	struct xfs_trans	*tp;
3155	struct xfs_inode	*wip = NULL;		/* whiteout inode */
3156	struct xfs_inode	*inodes[__XFS_SORT_INODES];
3157	int			i;
3158	int			num_inodes = __XFS_SORT_INODES;
3159	bool			new_parent = (src_dp != target_dp);
3160	bool			src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
3161	int			spaceres;
3162	int			error;
3163
3164	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
3165
3166	if ((flags & RENAME_EXCHANGE) && !target_ip)
3167		return -EINVAL;
3168
3169	/*
3170	 * If we are doing a whiteout operation, allocate the whiteout inode
3171	 * we will be placing at the target and ensure the type is set
3172	 * appropriately.
3173	 */
3174	if (flags & RENAME_WHITEOUT) {
3175		error = xfs_rename_alloc_whiteout(target_dp, &wip);
3176		if (error)
3177			return error;
3178
3179		/* setup target dirent info as whiteout */
3180		src_name->type = XFS_DIR3_FT_CHRDEV;
3181	}
3182
3183	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
3184				inodes, &num_inodes);
3185
3186	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
3187	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
3188	if (error == -ENOSPC) {
3189		spaceres = 0;
3190		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
3191				&tp);
3192	}
3193	if (error)
3194		goto out_release_wip;
3195
3196	/*
3197	 * Attach the dquots to the inodes
3198	 */
3199	error = xfs_qm_vop_rename_dqattach(inodes);
3200	if (error)
3201		goto out_trans_cancel;
3202
3203	/*
3204	 * Lock all the participating inodes. Depending upon whether
3205	 * the target_name exists in the target directory, and
3206	 * whether the target directory is the same as the source
3207	 * directory, we can lock from 2 to 4 inodes.
3208	 */
3209	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
3210
3211	/*
3212	 * Join all the inodes to the transaction. From this point on,
3213	 * we can rely on either trans_commit or trans_cancel to unlock
3214	 * them.
3215	 */
3216	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
3217	if (new_parent)
3218		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
3219	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
3220	if (target_ip)
3221		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
3222	if (wip)
3223		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
3224
3225	/*
3226	 * If we are using project inheritance, we only allow renames
3227	 * into our tree when the project IDs are the same; else the
3228	 * tree quota mechanism would be circumvented.
3229	 */
3230	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
3231		     target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
3232		error = -EXDEV;
3233		goto out_trans_cancel;
3234	}
3235
3236	/* RENAME_EXCHANGE is unique from here on. */
3237	if (flags & RENAME_EXCHANGE)
3238		return xfs_cross_rename(tp, src_dp, src_name, src_ip,
3239					target_dp, target_name, target_ip,
3240					spaceres);
3241
3242	/*
3243	 * Check for expected errors before we dirty the transaction
3244	 * so we can return an error without a transaction abort.
3245	 */
3246	if (target_ip == NULL) {
3247		/*
3248		 * If there's no space reservation, check the entry will
3249		 * fit before actually inserting it.
3250		 */
3251		if (!spaceres) {
3252			error = xfs_dir_canenter(tp, target_dp, target_name);
3253			if (error)
3254				goto out_trans_cancel;
3255		}
3256	} else {
3257		/*
3258		 * If target exists and it's a directory, check whether
3259		 * it can be destroyed.
3260		 */
3261		if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3262		    (!xfs_dir_isempty(target_ip) ||
3263		     (VFS_I(target_ip)->i_nlink > 2))) {
3264			error = -EEXIST;
3265			goto out_trans_cancel;
3266		}
3267	}
3268
3269	/*
3270	 * Lock the AGI buffers we need to handle bumping the nlink of the
3271	 * whiteout inode off the unlinked list and to handle dropping the
3272	 * nlink of the target inode.  Per locking order rules, do this in
3273	 * increasing AG order and before directory block allocation tries to
3274	 * grab AGFs because we grab AGIs before AGFs.
3275	 *
3276	 * The (vfs) caller must ensure that if src is a directory then
3277	 * target_ip is either null or an empty directory.
3278	 */
3279	for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
3280		if (inodes[i] == wip ||
3281		    (inodes[i] == target_ip &&
3282		     (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
3283			struct xfs_buf	*bp;
3284			xfs_agnumber_t	agno;
3285
3286			agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
3287			error = xfs_read_agi(mp, tp, agno, &bp);
3288			if (error)
3289				goto out_trans_cancel;
3290		}
3291	}
3292
3293	/*
3294	 * Directory entry creation below may acquire the AGF. Remove
3295	 * the whiteout from the unlinked list first to preserve correct
3296	 * AGI/AGF locking order. This dirties the transaction so failures
3297	 * after this point will abort and log recovery will clean up the
3298	 * mess.
3299	 *
3300	 * For whiteouts, we need to bump the link count on the whiteout
3301	 * inode. After this point we have a real link, so clear the tmpfile
3302	 * state flag from the inode so it doesn't accidentally get misused
3303	 * in future.
3304	 */
3305	if (wip) {
3306		ASSERT(VFS_I(wip)->i_nlink == 0);
3307		error = xfs_iunlink_remove(tp, wip);
3308		if (error)
3309			goto out_trans_cancel;
3310
3311		xfs_bumplink(tp, wip);
3312		VFS_I(wip)->i_state &= ~I_LINKABLE;
3313	}
3314
3315	/*
3316	 * Set up the target.
3317	 */
3318	if (target_ip == NULL) {
3319		/*
3320		 * If target does not exist and the rename crosses
3321		 * directories, adjust the target directory link count
3322		 * to account for the ".." reference from the new entry.
3323		 */
3324		error = xfs_dir_createname(tp, target_dp, target_name,
3325					   src_ip->i_ino, spaceres);
3326		if (error)
3327			goto out_trans_cancel;
3328
3329		xfs_trans_ichgtime(tp, target_dp,
3330					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3331
3332		if (new_parent && src_is_directory) {
3333			xfs_bumplink(tp, target_dp);
3334		}
3335	} else { /* target_ip != NULL */
3336		/*
3337		 * Link the source inode under the target name.
3338		 * If the source inode is a directory and we are moving
3339		 * it across directories, its ".." entry will be
3340		 * inconsistent until we replace that down below.
3341		 *
3342		 * In case there is already an entry with the same
3343		 * name at the destination directory, remove it first.
3344		 */
3345		error = xfs_dir_replace(tp, target_dp, target_name,
3346					src_ip->i_ino, spaceres);
3347		if (error)
3348			goto out_trans_cancel;
3349
3350		xfs_trans_ichgtime(tp, target_dp,
3351					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3352
3353		/*
3354		 * Decrement the link count on the target since the target
3355		 * dir no longer points to it.
3356		 */
3357		error = xfs_droplink(tp, target_ip);
3358		if (error)
3359			goto out_trans_cancel;
3360
3361		if (src_is_directory) {
3362			/*
3363			 * Drop the link from the old "." entry.
3364			 */
3365			error = xfs_droplink(tp, target_ip);
3366			if (error)
3367				goto out_trans_cancel;
3368		}
3369	} /* target_ip != NULL */
3370
3371	/*
3372	 * Remove the source.
3373	 */
3374	if (new_parent && src_is_directory) {
3375		/*
3376		 * Rewrite the ".." entry to point to the new
3377		 * directory.
3378		 */
3379		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3380					target_dp->i_ino, spaceres);
3381		ASSERT(error != -EEXIST);
3382		if (error)
3383			goto out_trans_cancel;
3384	}
3385
3386	/*
3387	 * We always want to hit the ctime on the source inode.
3388	 *
3389	 * This isn't strictly required by the standards since the source
3390	 * inode isn't really being changed, but old unix file systems did
3391	 * it and some incremental backup programs won't work without it.
3392	 */
3393	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3394	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3395
3396	/*
3397	 * Adjust the link count on src_dp.  This is necessary when
3398	 * renaming a directory, either within one parent when
3399	 * the target existed, or across two parent directories.
3400	 */
3401	if (src_is_directory && (new_parent || target_ip != NULL)) {
3402
3403		/*
3404		 * Decrement link count on src_directory since the
3405		 * entry that's moved no longer points to it.
3406		 */
3407		error = xfs_droplink(tp, src_dp);
3408		if (error)
3409			goto out_trans_cancel;
3410	}
3411
3412	/*
3413	 * For whiteouts, we only need to update the source dirent with the
3414	 * inode number of the whiteout inode rather than removing it
3415	 * altogether.
3416	 */
3417	if (wip) {
3418		error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3419					spaceres);
3420	} else
3421		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3422					   spaceres);
3423	if (error)
3424		goto out_trans_cancel;
3425
3426	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3427	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3428	if (new_parent)
3429		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3430
3431	error = xfs_finish_rename(tp);
3432	if (wip)
3433		xfs_irele(wip);
3434	return error;
3435
3436out_trans_cancel:
3437	xfs_trans_cancel(tp);
3438out_release_wip:
3439	if (wip)
3440		xfs_irele(wip);
3441	return error;
3442}
3443
3444static int
3445xfs_iflush(
3446	struct xfs_inode	*ip,
3447	struct xfs_buf		*bp)
3448{
3449	struct xfs_inode_log_item *iip = ip->i_itemp;
3450	struct xfs_dinode	*dip;
3451	struct xfs_mount	*mp = ip->i_mount;
3452	int			error;
3453
3454	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3455	ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3456	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3457	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3458	ASSERT(iip->ili_item.li_buf == bp);
3459
3460	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3461
3462	/*
3463	 * We don't flush the inode if any of the following checks fail, but we
3464	 * do still update the log item and attach to the backing buffer as if
3465	 * the flush happened. This is a formality to facilitate predictable
3466	 * error handling as the caller will shutdown and fail the buffer.
3467	 */
3468	error = -EFSCORRUPTED;
3469	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3470			       mp, XFS_ERRTAG_IFLUSH_1)) {
3471		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3472			"%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
3473			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3474		goto flush_out;
3475	}
3476	if (S_ISREG(VFS_I(ip)->i_mode)) {
3477		if (XFS_TEST_ERROR(
3478		    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3479		    ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3480		    mp, XFS_ERRTAG_IFLUSH_3)) {
3481			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3482				"%s: Bad regular inode %Lu, ptr "PTR_FMT,
3483				__func__, ip->i_ino, ip);
3484			goto flush_out;
3485		}
3486	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3487		if (XFS_TEST_ERROR(
3488		    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3489		    ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3490		    ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3491		    mp, XFS_ERRTAG_IFLUSH_4)) {
3492			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3493				"%s: Bad directory inode %Lu, ptr "PTR_FMT,
3494				__func__, ip->i_ino, ip);
3495			goto flush_out;
3496		}
3497	}
3498	if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
3499				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3500		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3501			"%s: detected corrupt incore inode %Lu, "
3502			"total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
3503			__func__, ip->i_ino,
3504			ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
3505			ip->i_d.di_nblocks, ip);
3506		goto flush_out;
3507	}
3508	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3509				mp, XFS_ERRTAG_IFLUSH_6)) {
3510		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3511			"%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
3512			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3513		goto flush_out;
3514	}
3515
3516	/*
3517	 * Inode item log recovery for v2 inodes is dependent on the
3518	 * di_flushiter count for correct sequencing. We bump the flush
3519	 * iteration count so we can detect flushes which postdate a log record
3520	 * during recovery. This is redundant as we now log every change and
3521	 * hence this can't happen but we need to still do it to ensure
3522	 * backwards compatibility with old kernels that predate logging all
3523	 * inode changes.
3524	 */
3525	if (!xfs_sb_version_has_v3inode(&mp->m_sb))
3526		ip->i_d.di_flushiter++;
3527
3528	/*
3529	 * If there are inline format data / attr forks attached to this inode,
3530	 * make sure they are not corrupt.
3531	 */
3532	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3533	    xfs_ifork_verify_local_data(ip))
3534		goto flush_out;
3535	if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
3536	    xfs_ifork_verify_local_attr(ip))
3537		goto flush_out;
3538
3539	/*
3540	 * Copy the dirty parts of the inode into the on-disk inode.  We always
3541	 * copy out the core of the inode, because if the inode is dirty at all
3542	 * the core must be.
3543	 */
3544	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3545
3546	/* Wrap, we never let the log put out DI_MAX_FLUSH */
3547	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3548		ip->i_d.di_flushiter = 0;
3549
3550	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3551	if (XFS_IFORK_Q(ip))
3552		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3553
3554	/*
3555	 * We've recorded everything logged in the inode, so we'd like to clear
3556	 * the ili_fields bits so we don't log and flush things unnecessarily.
3557	 * However, we can't stop logging all this information until the data
3558	 * we've copied into the disk buffer is written to disk.  If we did we
3559	 * might overwrite the copy of the inode in the log with all the data
3560	 * after re-logging only part of it, and in the face of a crash we
3561	 * wouldn't have all the data we need to recover.
3562	 *
3563	 * What we do is move the bits to the ili_last_fields field.  When
3564	 * logging the inode, these bits are moved back to the ili_fields field.
3565	 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3566	 * we know that the information those bits represent is permanently on
3567	 * disk.  As long as the flush completes before the inode is logged
3568	 * again, then both ili_fields and ili_last_fields will be cleared.
3569	 */
3570	error = 0;
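	/* Both the success path and the corruption checks above converge here. */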
3571flush_out:
3572	spin_lock(&iip->ili_lock);
3573	iip->ili_last_fields = iip->ili_fields;
3574	iip->ili_fields = 0;
3575	iip->ili_fsync_fields = 0;
3576	spin_unlock(&iip->ili_lock);
3577
3578	/*
3579	 * Store the current LSN of the inode so that we can tell whether the
3580	 * item has moved in the AIL from xfs_buf_inode_iodone().
3581	 */
3582	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3583				&iip->ili_item.li_lsn);
3584
3585	/* generate the checksum. */
3586	xfs_dinode_calc_crc(mp, dip);
3587	return error;
3588}
3589
3590/*
3591 * Non-blocking flush of dirty inode metadata into the backing buffer.
3592 *
3593 * The caller must have a reference to the inode and hold the cluster buffer
3594 * locked. The function will walk all the inodes attached to the cluster buffer
3595 * that it can find and lock without blocking, and flush them to the cluster buffer.
3596 *
3597 * On successful flushing of at least one inode, the caller must write out the
3598 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3599 * the caller needs to release the buffer. On failure, the filesystem will be
3600 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3601 * will be returned.
3602 */
3603int
3604xfs_iflush_cluster(
3605	struct xfs_buf		*bp)
3606{
3607	struct xfs_mount	*mp = bp->b_mount;
3608	struct xfs_log_item	*lip, *n;
3609	struct xfs_inode	*ip;
3610	struct xfs_inode_log_item *iip;
3611	int			clcount = 0;
3612	int			error = 0;
3613
3614	/*
3615	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
3616	 * can remove itself from the list.
3617	 */
3618	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3619		iip = (struct xfs_inode_log_item *)lip;
3620		ip = iip->ili_inode;
3621
3622		/*
3623		 * Quick and dirty check to avoid locks if possible.
3624		 */
3625		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3626			continue;
3627		if (xfs_ipincount(ip))
3628			continue;
3629
3630		/*
3631		 * The inode is still attached to the buffer, which means it is
3632		 * dirty but reclaim might try to grab it. Check carefully for
3633		 * that, and grab the ilock while still holding the i_flags_lock
3634		 * to guarantee reclaim will not be able to reclaim this inode
3635		 * once we drop the i_flags_lock.
3636		 */
3637		spin_lock(&ip->i_flags_lock);
3638		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3639		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3640			spin_unlock(&ip->i_flags_lock);
3641			continue;
3642		}
3643
3644		/*
3645		 * ILOCK will pin the inode against reclaim and prevent
3646		 * concurrent transactions modifying the inode while we are
3647		 * flushing the inode. If we get the lock, set the flushing
3648		 * state before we drop the i_flags_lock.
3649		 */
3650		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3651			spin_unlock(&ip->i_flags_lock);
3652			continue;
3653		}
3654		__xfs_iflags_set(ip, XFS_IFLUSHING);
3655		spin_unlock(&ip->i_flags_lock);
3656
3657		/*
3658		 * Abort flushing this inode if we are shut down because the
3659		 * inode may not currently be in the AIL. This can occur when
3660	 * log I/O failure unpins the inode without inserting it into the
3661		 * AIL, leaving a dirty/unpinned inode attached to the buffer
3662		 * that otherwise looks like it should be flushed.
3663		 */
3664		if (XFS_FORCED_SHUTDOWN(mp)) {
3665			xfs_iunpin_wait(ip);
3666			xfs_iflush_abort(ip);
3667			xfs_iunlock(ip, XFS_ILOCK_SHARED);
3668			error = -EIO;
3669			continue;
3670		}
3671
3672		/* don't block waiting on a log force to unpin dirty inodes */
3673		if (xfs_ipincount(ip)) {
3674			xfs_iflags_clear(ip, XFS_IFLUSHING);
3675			xfs_iunlock(ip, XFS_ILOCK_SHARED);
3676			continue;
3677		}
3678
3679		if (!xfs_inode_clean(ip))
3680			error = xfs_iflush(ip, bp);
3681		else
3682			xfs_iflags_clear(ip, XFS_IFLUSHING);
3683		xfs_iunlock(ip, XFS_ILOCK_SHARED);
3684		if (error)
3685			break;
3686		clcount++;
3687	}
3688
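	/*
	 * If any flush failed, error out the buffer and shut down the
	 * filesystem as described in the comment above this function.
	 */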
3689	if (error) {
3690		bp->b_flags |= XBF_ASYNC;
3691		xfs_buf_ioend_fail(bp);
3692		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3693		return error;
3694	}
3695
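	/* Nothing was flushed; tell the caller to release the buffer itself. */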
3696	if (!clcount)
3697		return -EAGAIN;
3698
3699	XFS_STATS_INC(mp, xs_icluster_flushcnt);
3700	XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3701	return 0;
3702
3703}
3704
3705/* Release an inode. */
3706void
3707xfs_irele(
3708	struct xfs_inode	*ip)
3709{
3710	trace_xfs_irele(ip, _RET_IP_);
3711	iput(VFS_I(ip));
3712}
3713
3714/*
3715 * Ensure all committed transactions touching the inode are written to the log.
3716 */
3717int
3718xfs_log_force_inode(
3719	struct xfs_inode	*ip)
3720{
3721	xfs_csn_t		seq = 0;
3722
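	/*
	 * A pinned inode has committed changes that have not yet reached the
	 * on-disk log, so record the commit sequence we need to force.
	 */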
3723	xfs_ilock(ip, XFS_ILOCK_SHARED);
3724	if (xfs_ipincount(ip))
3725		seq = ip->i_itemp->ili_commit_seq;
3726	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3727
3728	if (!seq)
3729		return 0;
3730	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3731}
3732
3733/*
3734 * Grab the exclusive iolock for a data copy from src to dest, making sure to
3735 * abide by the vfs locking order (lowest pointer value goes first) and to break
3736 * the layout leases before proceeding.  The loop is needed because we cannot call
3737 * the blocking break_layout() with the iolocks held, and therefore have to
3738 * back out both locks.
3739 */
3740static int
3741xfs_iolock_two_inodes_and_break_layout(
3742	struct inode		*src,
3743	struct inode		*dest)
3744{
3745	int			error;
3746
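	/* Sort by pointer value so both inodes are always locked in the same order. */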
3747	if (src > dest)
3748		swap(src, dest);
3749
3750retry:
3751	/* Wait to break both inodes' layouts before we start locking. */
3752	error = break_layout(src, true);
3753	if (error)
3754		return error;
3755	if (src != dest) {
3756		error = break_layout(dest, true);
3757		if (error)
3758			return error;
3759	}
3760
3761	/* Lock one inode and make sure nobody got in and leased it. */
3762	inode_lock(src);
3763	error = break_layout(src, false);
3764	if (error) {
3765		inode_unlock(src);
3766		if (error == -EWOULDBLOCK)
3767			goto retry;
3768		return error;
3769	}
3770
3771	if (src == dest)
3772		return 0;
3773
3774	/* Lock the other inode and make sure nobody got in and leased it. */
3775	inode_lock_nested(dest, I_MUTEX_NONDIR2);
3776	error = break_layout(dest, false);
3777	if (error) {
3778		inode_unlock(src);
3779		inode_unlock(dest);
3780		if (error == -EWOULDBLOCK)
3781			goto retry;
3782		return error;
3783	}
3784
3785	return 0;
3786}
3787
3788/*
3789 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3790 * mmap activity.
3791 */
3792int
3793xfs_ilock2_io_mmap(
3794	struct xfs_inode	*ip1,
3795	struct xfs_inode	*ip2)
3796{
3797	int			ret;
3798
3799	ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3800	if (ret)
3801		return ret;
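	/*
	 * The VFS inode locks taken above block the file syscall paths; now
	 * take the MMAPLOCKs to block page fault based I/O as well.
	 */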
3802	if (ip1 == ip2)
3803		xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3804	else
3805		xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
3806				    ip2, XFS_MMAPLOCK_EXCL);
3807	return 0;
3808}
3809
3810/* Unlock both inodes to allow IO and mmap activity. */
3811void
3812xfs_iunlock2_io_mmap(
3813	struct xfs_inode	*ip1,
3814	struct xfs_inode	*ip2)
3815{
3816	bool			same_inode = (ip1 == ip2);
3817
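	/* Drop the MMAPLOCKs first, then the VFS inode locks. */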
3818	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3819	if (!same_inode)
3820		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3821	inode_unlock(VFS_I(ip2));
3822	if (!same_inode)
3823		inode_unlock(VFS_I(ip1));
3824}
3825