xref: /kernel/linux/linux-5.10/fs/xfs/xfs_buf_item.c (revision 8c2ecf20)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"

kmem_zone_t	*xfs_buf_item_zone;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct xfs_log_iovec		*iovec)
{
	struct xfs_buf_log_format	*blfp = iovec->i_addr;
	char				*bmp_end;
	char				*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (char *)iovec->i_addr + iovec->i_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
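
/*
 * Illustrative sketch, not part of the original file: the two helpers
 * above combine into the pattern a log recovery caller would use to
 * validate a recovered buf log format region before dereferencing it.
 * The caller below is hypothetical; the real call sites live in the
 * log recovery code.
 */
#if 0	/* example only, never compiled */
static int example_validate_recovered_region(struct xfs_log_iovec *iovec)
{
	struct xfs_buf_log_format *blfp;

	if (!xfs_buf_log_check_iovec(iovec))
		return -EFSCORRUPTED;	/* truncated or corrupt region */

	blfp = iovec->i_addr;
	/*
	 * Now safe to walk the whole structure: the check guarantees
	 * iovec->i_len >= xfs_buf_log_format_size(blfp).
	 */
	return 0;
}
#endif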

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for
 * each contiguous stretch of chunks to be logged; chunks that are contiguous
 * both in the buffer and in memory share a single iovec.
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	int				*nvecs,
	int				*nbytes)
{
	struct xfs_buf			*bp = bip->bli_buf;
	int				next_bit;
	int				last_bit;

	last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (last_bit == -1)
		return;

	/*
	 * initial count for a dirty buffer is 2 vectors - the format structure
	 * and the first dirty region.
	 */
	*nvecs += 2;
	*nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;

	while (last_bit != -1) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					last_bit + 1);
		/*
		 * If we run out of bits, leave the loop,
		 * else if we find a new set of bits bump the number of vecs,
		 * else keep scanning the current set of bits.
		 */
		if (next_bit == -1) {
			break;
		} else if (next_bit != last_bit + 1) {
			last_bit = next_bit;
			(*nvecs)++;
		} else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
			   (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
			    XFS_BLF_CHUNK)) {
			last_bit = next_bit;
			(*nvecs)++;
		} else {
			last_bit++;
		}
		*nbytes += XFS_BLF_CHUNK;
	}
}
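
/*
 * Worked example, not part of the original file: for a segment whose
 * dirty bitmap has bits {0, 1, 4, 5, 6} set (two contiguous runs on
 * virtually contiguous pages), the walk above accounts:
 *
 *	*nvecs  += 3	(format structure + one iovec per run)
 *	*nbytes += xfs_buf_log_format_size(blfp) + 5 * XFS_BLF_CHUNK
 *
 * i.e. every set bit costs one 128 byte chunk of log space, and every
 * break in contiguity (including a run that straddles non-adjacent
 * pages in the buffer) costs one extra iovec.
 */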

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used at
		 * all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * the vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
					  nvecs, nbytes);
	}
	trace_xfs_buf_item_size(bip);
}
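
/*
 * Illustrative sketch, not part of the original file: what the stale
 * path above yields for a hypothetical single-segment stale buffer.
 * Guarded out so it is never compiled.
 */
#if 0	/* example only */
static void example_stale_size(struct xfs_buf_log_item *bip)
{
	int nvecs = 0, nbytes = 0;

	/* assume XFS_BLI_STALE is set and the buffer was invalidated */
	xfs_buf_item_size(&bip->bli_item, &nvecs, &nbytes);
	/*
	 * nvecs == 1 and nbytes is the size of the one format structure:
	 * the cancel record is all that hits the log, no matter how much
	 * dirty data the item tracked beforehand.
	 */
}
#endif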

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static inline bool
xfs_buf_item_straddle(
	struct xfs_buf		*bp,
	uint			offset,
	int			next_bit,
	int			last_bit)
{
	return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
		(xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
		 XFS_BLF_CHUNK);
}
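
/*
 * Note added for clarity: xfs_buf_offset() returns the kernel virtual
 * address of a byte in the buffer.  For multi-page buffers that are not
 * mapped virtually contiguous, two chunks that are adjacent in the
 * buffer's offset space can still live at non-adjacent addresses, so a
 * single iovec cannot span them; the straddle check above forces a new
 * iovec at such a boundary.
 */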

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	int			last_bit;
	int			next_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, there is
		 * nothing to format; return without advancing the vector
		 * pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}

	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	last_bit = first_bit;
	nbits = 1;
	for (;;) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)last_bit + 1);
		/*
		 * If we run out of bits fill in the last iovec and get out of
		 * the loop.  Else if we start a new set of bits then fill in
		 * the iovec for the series we were looking at and start
		 * counting the bits in the new one.  Else we're still in the
		 * same set of bits so just keep counting and scanning.
		 */
		if (next_bit == -1) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			break;
		} else if (next_bit != last_bit + 1 ||
		           xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			first_bit = next_bit;
			last_bit = next_bit;
			nbits = 1;
		} else {
			last_bit++;
			nbits++;
		}
	}
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We also always take a reference to the buffer log item here so that the bli
 * is held while the item is pinned in memory. This means that we can
 * unconditionally drop the reference count a transaction holds when the
 * transaction is completed.
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin().
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	xfs_buf_t		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	/*
	 * Drop the bli ref associated with the pin and grab the hold required
	 * for the I/O simulation failure in the abort case. We have to do this
	 * before the pin count drops because the AIL doesn't acquire a bli
	 * reference. Therefore if the refcount drops to zero, the bli could
	 * still be AIL resident and the buffer submitted for I/O (and freed on
	 * completion) at any point before we return. This can be removed once
	 * the AIL properly holds a reference on the bli.
	 */
	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (freed && !stale && remove)
		xfs_buf_hold(bp);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/* nothing to do but drop the pin count if the bli is active */
	if (!freed)
		return;

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
	} else if (remove) {
		/*
		 * The buffer must be locked and held by the caller to simulate
		 * an async I/O failure. We acquired the hold for this case
		 * before the buffer was unpinned.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
	}
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	    "Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)bp->b_bn);
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
		  XFS_FORCED_SHUTDOWN(lip->li_mountp);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state.  For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}
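
/*
 * Summary of the logic above, added for clarity:
 *
 *	last ref?	dirty?	aborted?	action
 *	no		-	-		none, bli stays alive
 *	yes		yes	no		none, freed on writeback
 *	yes		-	yes		AIL delete + free bli
 *	yes		no	no		free bli
 */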

/*
 * Release the buffer associated with the buf log item.  If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count.  It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now.  This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}
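
/*
 * Worked example, not part of the original file: an inode allocation
 * buffer first committed at LSN 100 sits in the AIL with li_lsn == 100.
 * If it is relogged and the new commit completes at LSN 200 before the
 * inode images are written back, the function above returns 100, so the
 * item keeps its old AIL position and the log tail cannot move past the
 * original images.  Any other buffer would simply return 200 here.
 */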

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

STATIC void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
				0);
}

STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kmem_free(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_zone, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}
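
/*
 * Worked example, not part of the original file: for a single-map 4096
 * byte buffer, chunks = DIV_ROUND_UP(4096, 128) = 32 and, with NBWORD
 * being 32 bits per bitmap word, map_size = DIV_ROUND_UP(32, 32) = 1,
 * i.e. one bitmap word describes the whole buffer at one bit per 128
 * byte chunk.
 */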

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint		first_bit;
	uint		last_bit;
	uint		bits_to_set;
	uint		bits_set;
	uint		word_num;
	uint		*wordp;
	uint		bit;
	uint		end_bit;
	uint		mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here.  That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far.  End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}
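
/*
 * Worked example, not part of the original file: logging bytes 100-399
 * of a segment.  first_bit = 100 >> XFS_BLF_SHIFT = 0 and last_bit =
 * 399 >> XFS_BLF_SHIFT = 3, so bits_to_set = 4.  The range starts at
 * bit 0 of word 0, so the leading partial word case is skipped, no full
 * word is reached (4 < NBWORD), and the trailing mask (1U << 4) - 1 =
 * 0xf marks chunks 0-3 dirty, i.e. bytes 0-511 of the segment.
 */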

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}
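
/*
 * Worked example, not part of the original file: a hypothetical two-map
 * compound buffer with 4096 bytes per map, logging buffer bytes
 * 4000-4200.  Map 0 covers offsets 0-4095, so the range is trimmed and
 * logged as segment bytes 4000-4095; map 1 covers 4096-8191, so the
 * remainder is rebased and logged as segment bytes 0-104.
 */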

/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
			     bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kmem_free(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_zone, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
void
xfs_buf_item_relse(
	xfs_buf_t	*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

void
xfs_buf_item_done(
	struct xfs_buf		*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, the AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
}
965