xref: /kernel/linux/linux-6.6/fs/jbd2/transaction.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * linux/fs/jbd2/transaction.c
4 *
5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6 *
7 * Copyright 1998 Red Hat corp --- All Rights Reserved
8 *
9 * Generic filesystem transaction handling code; part of the ext2fs
10 * journaling system.
11 *
12 * This file manages transactions (compound commits managed by the
13 * journaling code) and handles (individual atomic operations by the
14 * filesystem).
15 */
16
17#include <linux/time.h>
18#include <linux/fs.h>
19#include <linux/jbd2.h>
20#include <linux/errno.h>
21#include <linux/slab.h>
22#include <linux/timer.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/hrtimer.h>
26#include <linux/backing-dev.h>
27#include <linux/bug.h>
28#include <linux/module.h>
29#include <linux/sched/mm.h>
30
31#include <trace/events/jbd2.h>
32
33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35
36static struct kmem_cache *transaction_cache;
37int __init jbd2_journal_init_transaction_cache(void)
38{
39	J_ASSERT(!transaction_cache);
40	transaction_cache = kmem_cache_create("jbd2_transaction_s",
41					sizeof(transaction_t),
42					0,
43					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
44					NULL);
45	if (!transaction_cache) {
46		pr_emerg("JBD2: failed to create transaction cache\n");
47		return -ENOMEM;
48	}
49	return 0;
50}
51
52void jbd2_journal_destroy_transaction_cache(void)
53{
54	kmem_cache_destroy(transaction_cache);
55	transaction_cache = NULL;
56}
57
58void jbd2_journal_free_transaction(transaction_t *transaction)
59{
60	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61		return;
62	kmem_cache_free(transaction_cache, transaction);
63}
64
65/*
66 * Base amount of descriptor blocks we reserve for each transaction.
67 */
68static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
69{
70	int tag_space = journal->j_blocksize - sizeof(journal_header_t);
71	int tags_per_block;
72
73	/* Subtract UUID */
74	tag_space -= 16;
75	if (jbd2_journal_has_csum_v2or3(journal))
76		tag_space -= sizeof(struct jbd2_journal_block_tail);
77	/* Commit code leaves a slack space of 16 bytes at the end of block */
78	tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
79	/*
80	 * Revoke descriptors are accounted separately so we need to reserve
81	 * space for the commit block and normal transaction descriptor blocks.
82	 */
83	return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
84				tags_per_block);
85}
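
/*
 * Worked example (illustrative, not from the original source): for a
 * 4096-byte block size on a journal without the csum v2/v3 and 64bit
 * features, journal_tag_bytes() is 8, so:
 *
 *	tag_space      = 4096 - 12 (journal_header_t) - 16 (UUID) = 4068
 *	tags_per_block = (4068 - 16) / 8 = 506
 *
 * and a journal with j_max_transaction_buffers == 8192 reserves
 * 1 + DIV_ROUND_UP(8192, 506) = 18 descriptor blocks per transaction.
 */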
86
87/*
88 * jbd2_get_transaction: obtain a new transaction_t object.
89 *
90 * Simply initialise a new transaction. Set it up in the
91 * RUNNING state and add it to the current journal (which should not
92 * have an existing running transaction: we only make a new transaction
93 * once we have started to commit the old one).
94 *
95 * Preconditions:
96 *	The journal MUST be locked.  We don't perform atomic mallocs on the
97 *	new transaction	and we can't block without protecting against other
98 *	processes trying to touch the journal while it is in transition.
99 *
100 */
101
102static void jbd2_get_transaction(journal_t *journal,
103				transaction_t *transaction)
104{
105	transaction->t_journal = journal;
106	transaction->t_state = T_RUNNING;
107	transaction->t_start_time = ktime_get();
108	transaction->t_tid = journal->j_transaction_sequence++;
109	transaction->t_expires = jiffies + journal->j_commit_interval;
110	atomic_set(&transaction->t_updates, 0);
111	atomic_set(&transaction->t_outstanding_credits,
112		   jbd2_descriptor_blocks_per_trans(journal) +
113		   atomic_read(&journal->j_reserved_credits));
114	atomic_set(&transaction->t_outstanding_revokes, 0);
115	atomic_set(&transaction->t_handle_count, 0);
116	INIT_LIST_HEAD(&transaction->t_inode_list);
117	INIT_LIST_HEAD(&transaction->t_private_list);
118
119	/* Set up the commit timer for the new transaction. */
120	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
121	add_timer(&journal->j_commit_timer);
122
123	J_ASSERT(journal->j_running_transaction == NULL);
124	journal->j_running_transaction = transaction;
125	transaction->t_max_wait = 0;
126	transaction->t_start = jiffies;
127	transaction->t_requested = 0;
128}
129
130/*
131 * Handle management.
132 *
133 * A handle_t is an object which represents a single atomic update to a
134 * filesystem, and which tracks all of the modifications which form part
135 * of that one update.
136 */
137
138/*
139 * Update transaction's maximum wait time, if debugging is enabled.
140 *
141 * t_max_wait is carefully updated here with the use of atomic compare exchange.
142 * Note that there could be multiple threads trying to do this simultaneously,
143 * hence we use cmpxchg to avoid any use of locks in this case.
144 * With this, t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
145 */
146static inline void update_t_max_wait(transaction_t *transaction,
147				     unsigned long ts)
148{
149	unsigned long oldts, newts;
150
151	if (time_after(transaction->t_start, ts)) {
152		newts = jbd2_time_diff(ts, transaction->t_start);
153		oldts = READ_ONCE(transaction->t_max_wait);
154		while (oldts < newts)
155			oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
156	}
157}
158
159/*
160 * Wait until running transaction passes to T_FLUSH state and new transaction
161 * can thus be started. Also starts the commit if needed. The function expects
162 * running transaction to exist and releases j_state_lock.
163 */
164static void wait_transaction_locked(journal_t *journal)
165	__releases(journal->j_state_lock)
166{
167	DEFINE_WAIT(wait);
168	int need_to_start;
169	tid_t tid = journal->j_running_transaction->t_tid;
170
171	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
172			TASK_UNINTERRUPTIBLE);
173	need_to_start = !tid_geq(journal->j_commit_request, tid);
174	read_unlock(&journal->j_state_lock);
175	if (need_to_start)
176		jbd2_log_start_commit(journal, tid);
177	jbd2_might_wait_for_commit(journal);
178	schedule();
179	finish_wait(&journal->j_wait_transaction_locked, &wait);
180}
181
182/*
183 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
184 * state and new transaction can thus be started. The function releases
185 * j_state_lock.
186 */
187static void wait_transaction_switching(journal_t *journal)
188	__releases(journal->j_state_lock)
189{
190	DEFINE_WAIT(wait);
191
192	if (WARN_ON(!journal->j_running_transaction ||
193		    journal->j_running_transaction->t_state != T_SWITCH)) {
194		read_unlock(&journal->j_state_lock);
195		return;
196	}
197	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
198			TASK_UNINTERRUPTIBLE);
199	read_unlock(&journal->j_state_lock);
200	/*
201	 * We don't call jbd2_might_wait_for_commit() here as there's no
202	 * waiting for outstanding handles happening anymore in T_SWITCH state
203	 * and handling of reserved handles actually relies on that for
204	 * correctness.
205	 */
206	schedule();
207	finish_wait(&journal->j_wait_transaction_locked, &wait);
208}
209
210static void sub_reserved_credits(journal_t *journal, int blocks)
211{
212	atomic_sub(blocks, &journal->j_reserved_credits);
213	wake_up(&journal->j_wait_reserved);
214}
215
216/*
217 * Wait until we can add credits for the handle to the running transaction.
218 * Called with j_state_lock held for reading. Returns 0 if the handle joined
219 * the running transaction. Returns 1 if we had to wait, j_state_lock is
220 * dropped, and the caller must retry.
221 *
222 * Note: because j_state_lock may be dropped depending on the return
223 * value, we need to fake out sparse so it doesn't complain about a
224 * locking imbalance.  Callers of add_transaction_credits will need to
225 * make a similar accommodation.
226 */
227static int add_transaction_credits(journal_t *journal, int blocks,
228				   int rsv_blocks)
229__must_hold(&journal->j_state_lock)
230{
231	transaction_t *t = journal->j_running_transaction;
232	int needed;
233	int total = blocks + rsv_blocks;
234
235	/*
236	 * If the current transaction is locked down for commit, wait
237	 * for the lock to be released.
238	 */
239	if (t->t_state != T_RUNNING) {
240		WARN_ON_ONCE(t->t_state >= T_FLUSH);
241		wait_transaction_locked(journal);
242		__acquire(&journal->j_state_lock); /* fake out sparse */
243		return 1;
244	}
245
246	/*
247	 * If there is not enough space left in the log to write all
248	 * potential buffers requested by this operation, we need to
249	 * stall pending a log checkpoint to free some more log space.
250	 */
251	needed = atomic_add_return(total, &t->t_outstanding_credits);
252	if (needed > journal->j_max_transaction_buffers) {
253		/*
254		 * If the current transaction is already too large,
255		 * then start to commit it: we can then go back and
256		 * attach this handle to a new transaction.
257		 */
258		atomic_sub(total, &t->t_outstanding_credits);
259
260		/*
261		 * Is the number of reserved credits in the current transaction too
262		 * big to fit this handle? Wait until reserved credits are freed.
263		 */
264		if (atomic_read(&journal->j_reserved_credits) + total >
265		    journal->j_max_transaction_buffers) {
266			read_unlock(&journal->j_state_lock);
267			jbd2_might_wait_for_commit(journal);
268			wait_event(journal->j_wait_reserved,
269				   atomic_read(&journal->j_reserved_credits) + total <=
270				   journal->j_max_transaction_buffers);
271			__acquire(&journal->j_state_lock); /* fake out sparse */
272			return 1;
273		}
274
275		wait_transaction_locked(journal);
276		__acquire(&journal->j_state_lock); /* fake out sparse */
277		return 1;
278	}
279
280	/*
281	 * The commit code assumes that it can get enough log space
282	 * without forcing a checkpoint.  This is *critical* for
283	 * correctness: a checkpoint of a buffer which is also
284	 * associated with a committing transaction creates a deadlock,
285	 * so commit simply cannot force through checkpoints.
286	 *
287	 * We must therefore ensure the necessary space in the journal
288	 * *before* starting to dirty potentially checkpointed buffers
289	 * in the new transaction.
290	 */
291	if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
292		atomic_sub(total, &t->t_outstanding_credits);
293		read_unlock(&journal->j_state_lock);
294		jbd2_might_wait_for_commit(journal);
295		write_lock(&journal->j_state_lock);
296		if (jbd2_log_space_left(journal) <
297					journal->j_max_transaction_buffers)
298			__jbd2_log_wait_for_space(journal);
299		write_unlock(&journal->j_state_lock);
300		__acquire(&journal->j_state_lock); /* fake out sparse */
301		return 1;
302	}
303
304	/* No reservation? We are done... */
305	if (!rsv_blocks)
306		return 0;
307
308	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
309	/* We allow at most half of a transaction to be reserved */
310	if (needed > journal->j_max_transaction_buffers / 2) {
311		sub_reserved_credits(journal, rsv_blocks);
312		atomic_sub(total, &t->t_outstanding_credits);
313		read_unlock(&journal->j_state_lock);
314		jbd2_might_wait_for_commit(journal);
315		wait_event(journal->j_wait_reserved,
316			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
317			 <= journal->j_max_transaction_buffers / 2);
318		__acquire(&journal->j_state_lock); /* fake out sparse */
319		return 1;
320	}
321	return 0;
322}
323
324/*
325 * start_this_handle: Given a handle, deal with any locking or stalling
326 * needed to make sure that there is enough journal space for the handle
327 * to begin.  Attach the handle to a transaction and set up the
328 * transaction's buffer credits.
329 */
330
331static int start_this_handle(journal_t *journal, handle_t *handle,
332			     gfp_t gfp_mask)
333{
334	transaction_t	*transaction, *new_transaction = NULL;
335	int		blocks = handle->h_total_credits;
336	int		rsv_blocks = 0;
337	unsigned long ts = jiffies;
338
339	if (handle->h_rsv_handle)
340		rsv_blocks = handle->h_rsv_handle->h_total_credits;
341
342	/*
343	 * Limit the number of reserved credits to half of the maximum
344	 * transaction size, and limit the total number of credits so that a
345	 * single operation cannot exceed the maximum transaction size.
346	 */
347	if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
348	    (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
349		printk(KERN_ERR "JBD2: %s wants too many credits "
350		       "credits:%d rsv_credits:%d max:%d\n",
351		       current->comm, blocks, rsv_blocks,
352		       journal->j_max_transaction_buffers);
353		WARN_ON(1);
354		return -ENOSPC;
355	}
356
357alloc_transaction:
358	/*
359	 * This check is racy, but it is just an optimization to allocate a new
360	 * transaction early if there is a high chance we'll need it. If we
361	 * guess wrong, we'll retry or free the unused transaction.
362	 */
363	if (!data_race(journal->j_running_transaction)) {
364		/*
365		 * If __GFP_FS is not present, then we may have been called from
366		 * inside the fs writeback layer, so we MUST NOT fail.
367		 */
368		if ((gfp_mask & __GFP_FS) == 0)
369			gfp_mask |= __GFP_NOFAIL;
370		new_transaction = kmem_cache_zalloc(transaction_cache,
371						    gfp_mask);
372		if (!new_transaction)
373			return -ENOMEM;
374	}
375
376	jbd2_debug(3, "New handle %p going live.\n", handle);
377
378	/*
379	 * We need to hold j_state_lock until t_updates has been incremented,
380	 * for proper journal barrier handling
381	 */
382repeat:
383	read_lock(&journal->j_state_lock);
384	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
385	if (is_journal_aborted(journal) ||
386	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
387		read_unlock(&journal->j_state_lock);
388		jbd2_journal_free_transaction(new_transaction);
389		return -EROFS;
390	}
391
392	/*
393	 * Wait on the journal's transaction barrier if necessary. Specifically
394	 * we allow reserved handles to proceed because otherwise commit could
395	 * deadlock on page writeback not being able to complete.
396	 */
397	if (!handle->h_reserved && journal->j_barrier_count) {
398		read_unlock(&journal->j_state_lock);
399		wait_event(journal->j_wait_transaction_locked,
400				journal->j_barrier_count == 0);
401		goto repeat;
402	}
403
404	if (!journal->j_running_transaction) {
405		read_unlock(&journal->j_state_lock);
406		if (!new_transaction)
407			goto alloc_transaction;
408		write_lock(&journal->j_state_lock);
409		if (!journal->j_running_transaction &&
410		    (handle->h_reserved || !journal->j_barrier_count)) {
411			jbd2_get_transaction(journal, new_transaction);
412			new_transaction = NULL;
413		}
414		write_unlock(&journal->j_state_lock);
415		goto repeat;
416	}
417
418	transaction = journal->j_running_transaction;
419
420	if (!handle->h_reserved) {
421		/* We may have dropped j_state_lock - restart in that case */
422		if (add_transaction_credits(journal, blocks, rsv_blocks)) {
423			/*
424			 * add_transaction_credits releases
425			 * j_state_lock on a non-zero return
426			 */
427			__release(&journal->j_state_lock);
428			goto repeat;
429		}
430	} else {
431		/*
432		 * We have handle reserved so we are allowed to join T_LOCKED
433		 * transaction and we don't have to check for transaction size
434		 * and journal space. But we still have to wait while running
435		 * transaction is being switched to a committing one as it
436		 * won't wait for any handles anymore.
437		 */
438		if (transaction->t_state == T_SWITCH) {
439			wait_transaction_switching(journal);
440			goto repeat;
441		}
442		sub_reserved_credits(journal, blocks);
443		handle->h_reserved = 0;
444	}
445
446	/* OK, account for the buffers that this operation expects to
447	 * use and add the handle to the running transaction.
448	 */
449	update_t_max_wait(transaction, ts);
450	handle->h_transaction = transaction;
451	handle->h_requested_credits = blocks;
452	handle->h_revoke_credits_requested = handle->h_revoke_credits;
453	handle->h_start_jiffies = jiffies;
454	atomic_inc(&transaction->t_updates);
455	atomic_inc(&transaction->t_handle_count);
456	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
457		  handle, blocks,
458		  atomic_read(&transaction->t_outstanding_credits),
459		  jbd2_log_space_left(journal));
460	read_unlock(&journal->j_state_lock);
461	current->journal_info = handle;
462
463	rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
464	jbd2_journal_free_transaction(new_transaction);
465	/*
466	 * Ensure that no allocations done while the transaction is open are
467	 * going to recurse back to the fs layer.
468	 */
469	handle->saved_alloc_context = memalloc_nofs_save();
470	return 0;
471}
472
473/* Allocate a new handle.  This should probably be in a slab... */
474static handle_t *new_handle(int nblocks)
475{
476	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
477	if (!handle)
478		return NULL;
479	handle->h_total_credits = nblocks;
480	handle->h_ref = 1;
481
482	return handle;
483}
484
485handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
486			      int revoke_records, gfp_t gfp_mask,
487			      unsigned int type, unsigned int line_no)
488{
489	handle_t *handle = journal_current_handle();
490	int err;
491
492	if (!journal)
493		return ERR_PTR(-EROFS);
494
495	if (handle) {
496		J_ASSERT(handle->h_transaction->t_journal == journal);
497		handle->h_ref++;
498		return handle;
499	}
500
501	nblocks += DIV_ROUND_UP(revoke_records,
502				journal->j_revoke_records_per_block);
503	handle = new_handle(nblocks);
504	if (!handle)
505		return ERR_PTR(-ENOMEM);
506	if (rsv_blocks) {
507		handle_t *rsv_handle;
508
509		rsv_handle = new_handle(rsv_blocks);
510		if (!rsv_handle) {
511			jbd2_free_handle(handle);
512			return ERR_PTR(-ENOMEM);
513		}
514		rsv_handle->h_reserved = 1;
515		rsv_handle->h_journal = journal;
516		handle->h_rsv_handle = rsv_handle;
517	}
518	handle->h_revoke_credits = revoke_records;
519
520	err = start_this_handle(journal, handle, gfp_mask);
521	if (err < 0) {
522		if (handle->h_rsv_handle)
523			jbd2_free_handle(handle->h_rsv_handle);
524		jbd2_free_handle(handle);
525		return ERR_PTR(err);
526	}
527	handle->h_type = type;
528	handle->h_line_no = line_no;
529	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
530				handle->h_transaction->t_tid, type,
531				line_no, nblocks);
532
533	return handle;
534}
535EXPORT_SYMBOL(jbd2__journal_start);
536
537
538/**
539 * jbd2_journal_start() - Obtain a new handle.
540 * @journal: Journal to start transaction on.
541 * @nblocks: number of block buffers we might modify
542 *
543 * We make sure that the transaction can guarantee at least nblocks of
544 * modified buffers in the log.  We block until the log can guarantee
545 * that much space. Additionally, if rsv_blocks > 0 (possible only via
546 * jbd2__journal_start()), we also create another handle with rsv_blocks
547 * reserved blocks in the journal. This handle is stored in h_rsv_handle.
548 * It is not attached to any particular transaction and thus doesn't block
549 * transaction commit. If the caller uses this reserved handle, it has to
550 * set h_rsv_handle to NULL, as otherwise jbd2_journal_stop() on the parent
551 * handle will dispose of the reserved one. A reserved handle has to be
552 * converted to a normal one using jbd2_journal_start_reserved() before use.
553 *
554 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
555 * on failure.
556 */
557handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
558{
559	return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
560}
561EXPORT_SYMBOL(jbd2_journal_start);
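
/*
 * Example (an illustrative sketch, not part of the original file): the
 * typical life cycle of a handle in a filesystem modifying one metadata
 * buffer; @journal and @bh are assumed to exist:
 *
 *	handle_t *handle = jbd2_journal_start(journal, 1);
 *
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	if (!err) {
 *		(modify bh->b_data under the handle)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 *	}
 *	jbd2_journal_stop(handle);
 */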
562
563static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
564{
565	journal_t *journal = handle->h_journal;
566
567	WARN_ON(!handle->h_reserved);
568	sub_reserved_credits(journal, handle->h_total_credits);
569	if (t)
570		atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
571}
572
573void jbd2_journal_free_reserved(handle_t *handle)
574{
575	journal_t *journal = handle->h_journal;
576
577	/* Get j_state_lock to pin running transaction if it exists */
578	read_lock(&journal->j_state_lock);
579	__jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
580	read_unlock(&journal->j_state_lock);
581	jbd2_free_handle(handle);
582}
583EXPORT_SYMBOL(jbd2_journal_free_reserved);
584
585/**
586 * jbd2_journal_start_reserved() - start reserved handle
587 * @handle: handle to start
588 * @type: for handle statistics
589 * @line_no: for handle statistics
590 *
591 * Start handle that has been previously reserved with jbd2_journal_reserve().
592 * This attaches @handle to the running transaction (or creates one if there's
593 * no transaction running). Unlike jbd2_journal_start(), this function cannot
594 * block on journal commit, checkpointing, or similar stuff. It can block on
595 * memory allocation or a frozen journal, though.
596 *
597 * Return 0 on success, non-zero on error - handle is freed in that case.
598 */
599int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
600				unsigned int line_no)
601{
602	journal_t *journal = handle->h_journal;
603	int ret = -EIO;
604
605	if (WARN_ON(!handle->h_reserved)) {
606		/* Someone passed in normal handle? Just stop it. */
607		jbd2_journal_stop(handle);
608		return ret;
609	}
610	/*
611	 * The usefulness of mixing reserved and unreserved handles is
612	 * questionable. So far nobody seems to need it, so just error out.
613	 */
614	if (WARN_ON(current->journal_info)) {
615		jbd2_journal_free_reserved(handle);
616		return ret;
617	}
618
619	handle->h_journal = NULL;
620	/*
621	 * GFP_NOFS is here because callers are likely from writeback or
622	 * similarly constrained call sites
623	 */
624	ret = start_this_handle(journal, handle, GFP_NOFS);
625	if (ret < 0) {
626		handle->h_journal = journal;
627		jbd2_journal_free_reserved(handle);
628		return ret;
629	}
630	handle->h_type = type;
631	handle->h_line_no = line_no;
632	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
633				handle->h_transaction->t_tid, type,
634				line_no, handle->h_total_credits);
635	return 0;
636}
637EXPORT_SYMBOL(jbd2_journal_start_reserved);
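
/*
 * Illustrative sketch of the whole reserved-handle flow (error handling
 * omitted; the variable names are placeholders):
 *
 *	handle = jbd2__journal_start(journal, nblocks, rsv_blocks,
 *				     revoke_records, GFP_NOFS, type, line_no);
 *	rsv = handle->h_rsv_handle;
 *	handle->h_rsv_handle = NULL;	(detach, or jbd2_journal_stop() on
 *					 the parent would dispose of it)
 *	jbd2_journal_stop(handle);
 *	...
 *	err = jbd2_journal_start_reserved(rsv, type, line_no);
 *	(use rsv like a normal handle, then jbd2_journal_stop(rsv))
 *
 * An unused reservation is dropped with jbd2_journal_free_reserved().
 */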
638
639/**
640 * jbd2_journal_extend() - extend buffer credits.
641 * @handle:  handle to 'extend'
642 * @nblocks: nr blocks to try to extend by.
643 * @revoke_records: number of revoke records to try to extend by.
644 *
645 * Some transactions, such as large extends and truncates, can be done
646 * atomically all at once or in several stages.  The operation requests
647 * a credit for a number of buffer modifications in advance, but can
648 * extend its credit if it needs more.
649 *
650 * jbd2_journal_extend tries to give the running handle more buffer credits.
651 * It does not guarantee the allocation - this is best-effort only.
652 * The calling process MUST be able to deal cleanly with a failure to
653 * extend here.
654 *
655 * Return 0 on success, non-zero on failure.
656 *
657 * return code < 0 implies an error
658 * return code > 0 implies normal transaction-full status.
659 */
660int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
661{
662	transaction_t *transaction = handle->h_transaction;
663	journal_t *journal;
664	int result;
665	int wanted;
666
667	if (is_handle_aborted(handle))
668		return -EROFS;
669	journal = transaction->t_journal;
670
671	result = 1;
672
673	read_lock(&journal->j_state_lock);
674
675	/* Don't extend a locked-down transaction! */
676	if (transaction->t_state != T_RUNNING) {
677		jbd2_debug(3, "denied handle %p %d blocks: "
678			  "transaction not running\n", handle, nblocks);
679		goto error_out;
680	}
681
682	nblocks += DIV_ROUND_UP(
683			handle->h_revoke_credits_requested + revoke_records,
684			journal->j_revoke_records_per_block) -
685		DIV_ROUND_UP(
686			handle->h_revoke_credits_requested,
687			journal->j_revoke_records_per_block);
688	wanted = atomic_add_return(nblocks,
689				   &transaction->t_outstanding_credits);
690
691	if (wanted > journal->j_max_transaction_buffers) {
692		jbd2_debug(3, "denied handle %p %d blocks: "
693			  "transaction too large\n", handle, nblocks);
694		atomic_sub(nblocks, &transaction->t_outstanding_credits);
695		goto error_out;
696	}
697
698	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
699				 transaction->t_tid,
700				 handle->h_type, handle->h_line_no,
701				 handle->h_total_credits,
702				 nblocks);
703
704	handle->h_total_credits += nblocks;
705	handle->h_requested_credits += nblocks;
706	handle->h_revoke_credits += revoke_records;
707	handle->h_revoke_credits_requested += revoke_records;
708	result = 0;
709
710	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
711error_out:
712	read_unlock(&journal->j_state_lock);
713	return result;
714}
715
716static void stop_this_handle(handle_t *handle)
717{
718	transaction_t *transaction = handle->h_transaction;
719	journal_t *journal = transaction->t_journal;
720	int revokes;
721
722	J_ASSERT(journal_current_handle() == handle);
723	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
724	current->journal_info = NULL;
725	/*
726	 * Subtract necessary revoke descriptor blocks from handle credits. We
727	 * take care to account only for revoke descriptor blocks the
728	 * transaction will really need as large sequences of transactions with
729	 * small numbers of revokes are relatively common.
730	 */
731	revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
732	if (revokes) {
733		int t_revokes, revoke_descriptors;
734		int rr_per_blk = journal->j_revoke_records_per_block;
735
736		WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
737				> handle->h_total_credits);
738		t_revokes = atomic_add_return(revokes,
739				&transaction->t_outstanding_revokes);
740		revoke_descriptors =
741			DIV_ROUND_UP(t_revokes, rr_per_blk) -
742			DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
743		handle->h_total_credits -= revoke_descriptors;
744	}
745	atomic_sub(handle->h_total_credits,
746		   &transaction->t_outstanding_credits);
747	if (handle->h_rsv_handle)
748		__jbd2_journal_unreserve_handle(handle->h_rsv_handle,
749						transaction);
750	if (atomic_dec_and_test(&transaction->t_updates))
751		wake_up(&journal->j_wait_updates);
752
753	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
754	/*
755	 * Scope of the GFP_NOFS context is over here and so we can restore the
756	 * original alloc context.
757	 */
758	memalloc_nofs_restore(handle->saved_alloc_context);
759}
760
761/**
762 * jbd2__journal_restart() - restart a handle.
763 * @handle:  handle to restart
764 * @nblocks: nr credits requested
765 * @revoke_records: number of revoke record credits requested
766 * @gfp_mask: memory allocation flags (for start_this_handle)
767 *
768 * Restart a handle for a multi-transaction filesystem
769 * operation.
770 *
771 * If the jbd2_journal_extend() call above fails to grant new buffer credits
772 * to a running handle, a call to jbd2_journal_restart will commit the
773 * handle's transaction so far and reattach the handle to a new
774 * transaction capable of guaranteeing the requested number of
775 * credits. We preserve the reserved handle if one is attached to the
776 * passed-in handle.
777 */
778int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
779			  gfp_t gfp_mask)
780{
781	transaction_t *transaction = handle->h_transaction;
782	journal_t *journal;
783	tid_t		tid;
784	int		need_to_start;
785	int		ret;
786
787	/* If we've had an abort of any type, don't even think about
788	 * actually doing the restart! */
789	if (is_handle_aborted(handle))
790		return 0;
791	journal = transaction->t_journal;
792	tid = transaction->t_tid;
793
794	/*
795	 * First unlink the handle from its current transaction, and start the
796	 * commit on that.
797	 */
798	jbd2_debug(2, "restarting handle %p\n", handle);
799	stop_this_handle(handle);
800	handle->h_transaction = NULL;
801
802	/*
803	 * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
804	 * get rid of pointless j_state_lock traffic like this.
805	 */
806	read_lock(&journal->j_state_lock);
807	need_to_start = !tid_geq(journal->j_commit_request, tid);
808	read_unlock(&journal->j_state_lock);
809	if (need_to_start)
810		jbd2_log_start_commit(journal, tid);
811	handle->h_total_credits = nblocks +
812		DIV_ROUND_UP(revoke_records,
813			     journal->j_revoke_records_per_block);
814	handle->h_revoke_credits = revoke_records;
815	ret = start_this_handle(journal, handle, gfp_mask);
816	trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
817				 ret ? 0 : handle->h_transaction->t_tid,
818				 handle->h_type, handle->h_line_no,
819				 handle->h_total_credits);
820	return ret;
821}
822EXPORT_SYMBOL(jbd2__journal_restart);
823
824
825int jbd2_journal_restart(handle_t *handle, int nblocks)
826{
827	return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
828}
829EXPORT_SYMBOL(jbd2_journal_restart);
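
/*
 * Illustrative extend-or-restart pattern (a sketch of roughly what ext4's
 * ext4_journal_ensure_credits() helpers do; not a verbatim copy):
 *
 *	if (jbd2_handle_buffer_credits(handle) < needed) {
 *		err = jbd2_journal_extend(handle, needed, revokes);
 *		if (err > 0)
 *			err = jbd2__journal_restart(handle, needed,
 *						    revokes, GFP_NOFS);
 *		if (err)
 *			return err;	(caller must cope with the failure)
 *	}
 */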
830
831/*
832 * Waits for any outstanding t_updates to finish.
833 * This is called with write j_state_lock held.
834 */
835void jbd2_journal_wait_updates(journal_t *journal)
836{
837	DEFINE_WAIT(wait);
838
839	while (1) {
840		/*
841		 * Note that the running transaction can get freed under us if
842		 * this transaction is getting committed in
843		 * jbd2_journal_commit_transaction() ->
844		 * jbd2_journal_free_transaction(). This can only happen when we
845		 * release j_state_lock -> schedule() -> acquire j_state_lock.
846		 * Hence we must re-read the j_running_transaction value every
847		 * time (after each j_state_lock release/acquire cycle), else
848		 * we may hit a use-after-free of the old, freed transaction.
849		 */
850		transaction_t *transaction = journal->j_running_transaction;
851
852		if (!transaction)
853			break;
854
855		prepare_to_wait(&journal->j_wait_updates, &wait,
856				TASK_UNINTERRUPTIBLE);
857		if (!atomic_read(&transaction->t_updates)) {
858			finish_wait(&journal->j_wait_updates, &wait);
859			break;
860		}
861		write_unlock(&journal->j_state_lock);
862		schedule();
863		finish_wait(&journal->j_wait_updates, &wait);
864		write_lock(&journal->j_state_lock);
865	}
866}
867
868/**
869 * jbd2_journal_lock_updates() - establish a transaction barrier.
870 * @journal:  Journal to establish a barrier on.
871 *
872 * This locks out any further updates from being started, and blocks
873 * until all existing updates have completed, returning only once the
874 * journal is in a quiescent state with no updates running.
875 *
876 * The journal lock should not be held on entry.
877 */
878void jbd2_journal_lock_updates(journal_t *journal)
879{
880	jbd2_might_wait_for_commit(journal);
881
882	write_lock(&journal->j_state_lock);
883	++journal->j_barrier_count;
884
885	/* Wait until there are no reserved handles */
886	if (atomic_read(&journal->j_reserved_credits)) {
887		write_unlock(&journal->j_state_lock);
888		wait_event(journal->j_wait_reserved,
889			   atomic_read(&journal->j_reserved_credits) == 0);
890		write_lock(&journal->j_state_lock);
891	}
892
893	/* Wait until there are no running t_updates */
894	jbd2_journal_wait_updates(journal);
895
896	write_unlock(&journal->j_state_lock);
897
898	/*
899	 * We have now established a barrier against other normal updates, but
900	 * we also need to barrier against other jbd2_journal_lock_updates() calls
901	 * to make sure that we serialise special journal-locked operations
902	 * too.
903	 */
904	mutex_lock(&journal->j_barrier);
905}
906
907/**
908 * jbd2_journal_unlock_updates() - release barrier
909 * @journal:  Journal to release the barrier on.
910 *
911 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
912 *
913 * Should be called without the journal lock held.
914 */
915void jbd2_journal_unlock_updates(journal_t *journal)
916{
917	J_ASSERT(journal->j_barrier_count != 0);
918
919	mutex_unlock(&journal->j_barrier);
920	write_lock(&journal->j_state_lock);
921	--journal->j_barrier_count;
922	write_unlock(&journal->j_state_lock);
923	wake_up_all(&journal->j_wait_transaction_locked);
924}
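
/*
 * Illustrative pairing (e.g. how a filesystem quiesces the journal around
 * an operation that must not race with any running handle, such as a
 * freeze):
 *
 *	jbd2_journal_lock_updates(journal);
 *	(no handles are running here; the journal is quiescent)
 *	jbd2_journal_unlock_updates(journal);
 */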
925
926static void warn_dirty_buffer(struct buffer_head *bh)
927{
928	printk(KERN_WARNING
929	       "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
930	       "There's a risk of filesystem corruption in case of system "
931	       "crash.\n",
932	       bh->b_bdev, (unsigned long long)bh->b_blocknr);
933}
934
935/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
936static void jbd2_freeze_jh_data(struct journal_head *jh)
937{
938	char *source;
939	struct buffer_head *bh = jh2bh(jh);
940
941	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
942	source = kmap_local_folio(bh->b_folio, bh_offset(bh));
943	/* Fire data frozen trigger just before we copy the data */
944	jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
945	memcpy(jh->b_frozen_data, source, bh->b_size);
946	kunmap_local(source);
947
948	/*
949	 * Now that the frozen data is saved off, we need to store any matching
950	 * triggers.
951	 */
952	jh->b_frozen_triggers = jh->b_triggers;
953}
954
955/*
956 * If the buffer is already part of the current transaction, then there
957 * is nothing we need to do.  If it is already part of a prior
958 * transaction which we are still committing to disk, then we need to
959 * make sure that we do not overwrite the old copy: we do copy-out to
960 * preserve the copy going to disk.  We also account the buffer against
961 * the handle's metadata buffer credits (unless the buffer is already
962 * part of the transaction, that is).
963 *
964 */
965static int
966do_get_write_access(handle_t *handle, struct journal_head *jh,
967			int force_copy)
968{
969	struct buffer_head *bh;
970	transaction_t *transaction = handle->h_transaction;
971	journal_t *journal;
972	int error;
973	char *frozen_buffer = NULL;
974	unsigned long start_lock, time_lock;
975
976	journal = transaction->t_journal;
977
978	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
979
980	JBUFFER_TRACE(jh, "entry");
981repeat:
982	bh = jh2bh(jh);
983
984	/* @@@ Need to check for errors here at some point. */
985
986	start_lock = jiffies;
987	lock_buffer(bh);
988	spin_lock(&jh->b_state_lock);
989
990	/* If it takes too long to lock the buffer, trace it */
991	time_lock = jbd2_time_diff(start_lock, jiffies);
992	if (time_lock > HZ/10)
993		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
994			jiffies_to_msecs(time_lock));
995
996	/* We now hold the buffer lock so it is safe to query the buffer
997	 * state.  Is the buffer dirty?
998	 *
999	 * If so, there are two possibilities.  The buffer may be
1000	 * non-journaled, and undergoing a quite legitimate writeback.
1001	 * Otherwise, it is journaled, and we don't expect dirty buffers
1002	 * in that state (the buffers should be marked JBD_Dirty
1003	 * instead.)  So either the IO is being done under our own
1004	 * control and this is a bug, or it's a third party IO such as
1005	 * dump(8) (which may leave the buffer scheduled for read ---
1006	 * ie. locked but not dirty) or tune2fs (which may actually have
1007	 * the buffer dirtied, ugh.)  */
1008
1009	if (buffer_dirty(bh) && jh->b_transaction) {
1010		warn_dirty_buffer(bh);
1011		/*
1012		 * We need to clean the dirty flag and we must do it under the
1013		 * buffer lock to be sure we don't race with running write-out.
1014		 */
1015		JBUFFER_TRACE(jh, "Journalling dirty buffer");
1016		clear_buffer_dirty(bh);
1017		/*
1018		 * The buffer is going to be added to BJ_Reserved list now and
1019		 * nothing guarantees jbd2_journal_dirty_metadata() will be
1020		 * ever called for it. So we need to set jbddirty bit here to
1021		 * make sure the buffer is dirtied and written out when the
1022		 * journaling machinery is done with it.
1023		 */
1024		set_buffer_jbddirty(bh);
1025	}
1026
1027	error = -EROFS;
1028	if (is_handle_aborted(handle)) {
1029		spin_unlock(&jh->b_state_lock);
1030		unlock_buffer(bh);
1031		goto out;
1032	}
1033	error = 0;
1034
1035	/*
1036	 * The buffer is already part of this transaction if b_transaction or
1037	 * b_next_transaction points to it
1038	 */
1039	if (jh->b_transaction == transaction ||
1040	    jh->b_next_transaction == transaction) {
1041		unlock_buffer(bh);
1042		goto done;
1043	}
1044
1045	/*
1046	 * this is the first time this transaction is touching this buffer,
1047	 * reset the modified flag
1048	 */
1049	jh->b_modified = 0;
1050
1051	/*
1052	 * If the buffer is not journaled right now, we need to make sure it
1053	 * doesn't get written to disk before the caller actually commits the
1054	 * new data
1055	 */
1056	if (!jh->b_transaction) {
1057		JBUFFER_TRACE(jh, "no transaction");
1058		J_ASSERT_JH(jh, !jh->b_next_transaction);
1059		JBUFFER_TRACE(jh, "file as BJ_Reserved");
1060		/*
1061		 * Make sure all stores to jh (b_modified, b_frozen_data) are
1062		 * visible before attaching it to the running transaction.
1063		 * Paired with barrier in jbd2_write_access_granted()
1064		 */
1065		smp_wmb();
1066		spin_lock(&journal->j_list_lock);
1067		if (test_clear_buffer_dirty(bh)) {
1068			/*
1069			 * Execute buffer dirty clearing and jh->b_transaction
1070			 * assignment under journal->j_list_lock locked to
1071			 * prevent bh being removed from checkpoint list if
1072			 * the buffer is in an intermediate state (not dirty
1073			 * and jh->b_transaction is NULL).
1074			 */
1075			JBUFFER_TRACE(jh, "Journalling dirty buffer");
1076			set_buffer_jbddirty(bh);
1077		}
1078		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1079		spin_unlock(&journal->j_list_lock);
1080		unlock_buffer(bh);
1081		goto done;
1082	}
1083	unlock_buffer(bh);
1084
1085	/*
1086	 * If there is already a copy-out version of this buffer, then we don't
1087	 * need to make another one
1088	 */
1089	if (jh->b_frozen_data) {
1090		JBUFFER_TRACE(jh, "has frozen data");
1091		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1092		goto attach_next;
1093	}
1094
1095	JBUFFER_TRACE(jh, "owned by older transaction");
1096	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1097	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
1098
1099	/*
1100	 * There is one case we have to be very careful about.  If the
1101	 * committing transaction is currently writing this buffer out to disk
1102	 * and has NOT made a copy-out, then we cannot modify the buffer
1103	 * contents at all right now.  The essence of copy-out is that it is
1104	 * the extra copy, not the primary copy, which gets journaled.  If the
1105	 * primary copy is already going to disk then we cannot do copy-out
1106	 * here.
1107	 */
1108	if (buffer_shadow(bh)) {
1109		JBUFFER_TRACE(jh, "on shadow: sleep");
1110		spin_unlock(&jh->b_state_lock);
1111		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
1112		goto repeat;
1113	}
1114
1115	/*
1116	 * Only do the copy if the currently-owning transaction still needs it.
1117	 * If buffer isn't on BJ_Metadata list, the committing transaction is
1118	 * past that stage (here we use the fact that BH_Shadow is set under
1119	 * bh_state lock together with refiling to BJ_Shadow list and at this
1120	 * point we know the buffer doesn't have BH_Shadow set).
1121	 *
1122	 * Subtle point, though: if this is a get_undo_access, then we will be
1123	 * relying on the frozen_data to contain the new value of the
1124	 * committed_data record after the transaction, so we HAVE to force the
1125	 * frozen_data copy in that case.
1126	 */
1127	if (jh->b_jlist == BJ_Metadata || force_copy) {
1128		JBUFFER_TRACE(jh, "generate frozen data");
1129		if (!frozen_buffer) {
1130			JBUFFER_TRACE(jh, "allocate memory for buffer");
1131			spin_unlock(&jh->b_state_lock);
1132			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
1133						   GFP_NOFS | __GFP_NOFAIL);
1134			goto repeat;
1135		}
1136		jh->b_frozen_data = frozen_buffer;
1137		frozen_buffer = NULL;
1138		jbd2_freeze_jh_data(jh);
1139	}
1140attach_next:
1141	/*
1142	 * Make sure all stores to jh (b_modified, b_frozen_data) are visible
1143	 * before attaching it to the running transaction. Paired with barrier
1144	 * in jbd2_write_access_granted()
1145	 */
1146	smp_wmb();
1147	jh->b_next_transaction = transaction;
1148
1149done:
1150	spin_unlock(&jh->b_state_lock);
1151
1152	/*
1153	 * If we are about to journal a buffer, then any revoke pending on it is
1154	 * no longer valid
1155	 */
1156	jbd2_journal_cancel_revoke(handle, jh);
1157
1158out:
1159	if (unlikely(frozen_buffer))	/* It's usually NULL */
1160		jbd2_free(frozen_buffer, bh->b_size);
1161
1162	JBUFFER_TRACE(jh, "exit");
1163	return error;
1164}
1165
1166/* Fast check whether buffer is already attached to the required transaction */
1167static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
1168							bool undo)
1169{
1170	struct journal_head *jh;
1171	bool ret = false;
1172
1173	/* Dirty buffers require special handling... */
1174	if (buffer_dirty(bh))
1175		return false;
1176
1177	/*
1178	 * RCU protects us from dereferencing freed pages. So the checks we do
1179	 * are guaranteed not to oops. However the jh slab object can get freed
1180	 * & reallocated while we work with it. So we have to be careful. When
1181	 * we see jh attached to the running transaction, we know it must stay
1182	 * so until the transaction is committed. Thus jh won't be freed and
1183	 * will be attached to the same bh while we run.  However it can
1184	 * happen jh gets freed, reallocated, and attached to the transaction
1185	 * just after we get pointer to it from bh. So we have to be careful
1186	 * and recheck jh still belongs to our bh before we return success.
1187	 */
1188	rcu_read_lock();
1189	if (!buffer_jbd(bh))
1190		goto out;
1191	/* This should be bh2jh() but that doesn't work with inline functions */
1192	jh = READ_ONCE(bh->b_private);
1193	if (!jh)
1194		goto out;
1195	/* For undo access buffer must have data copied */
1196	if (undo && !jh->b_committed_data)
1197		goto out;
1198	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
1199	    READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
1200		goto out;
1201	/*
1202	 * There are two reasons for the barrier here:
1203	 * 1) Make sure to fetch b_bh after we did previous checks so that we
1204	 * detect when jh went through free, realloc, attach to transaction
1205	 * while we were checking. Paired with implicit barrier in that path.
1206	 * 2) So that access to bh done after jbd2_write_access_granted()
1207	 * doesn't get reordered and see inconsistent state of concurrent
1208	 * do_get_write_access().
1209	 */
1210	smp_mb();
1211	if (unlikely(jh->b_bh != bh))
1212		goto out;
1213	ret = true;
1214out:
1215	rcu_read_unlock();
1216	return ret;
1217}
1218
1219/**
1220 * jbd2_journal_get_write_access() - notify intent to modify a buffer
1221 *				     for metadata (not data) update.
1222 * @handle: transaction to add buffer modifications to
1223 * @bh:     bh to be used for metadata writes
1224 *
1225 * Returns: error code or 0 on success.
1226 *
1227 * In full data journalling mode the buffer may be of type BJ_AsyncData,
1228 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
1229 */
1230
1231int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
1232{
1233	struct journal_head *jh;
1234	int rc;
1235
1236	if (is_handle_aborted(handle))
1237		return -EROFS;
1238
1239	if (jbd2_write_access_granted(handle, bh, false))
1240		return 0;
1241
1242	jh = jbd2_journal_add_journal_head(bh);
1243	/* We do not want to get caught playing with fields which the
1244	 * log thread also manipulates.  Make sure that the buffer
1245	 * completes any outstanding IO before proceeding. */
1246	rc = do_get_write_access(handle, jh, 0);
1247	jbd2_journal_put_journal_head(jh);
1248	return rc;
1249}
1250
1251
1252/*
1253 * When the user wants to journal a newly created buffer_head
1254 * (ie. getblk() returned a new buffer and we are going to populate it
1255 * manually rather than reading off disk), then we need to keep the
1256 * buffer_head locked until it has been completely filled with new
1257 * data.  In this case, we should be able to make the assertion that
1258 * the bh is not already part of an existing transaction.
1259 *
1260 * The buffer should already be locked by the caller by this point.
1261 * There is no lock ranking violation: it was a newly created,
1262 * unlocked buffer beforehand. */
1263
1264/**
1265 * jbd2_journal_get_create_access() - notify intent to use newly created bh
1266 * @handle: transaction to add the new buffer to
1267 * @bh: new buffer.
1268 *
1269 * Call this if you create a new bh.
1270 */
1271int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1272{
1273	transaction_t *transaction = handle->h_transaction;
1274	journal_t *journal;
1275	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1276	int err;
1277
1278	jbd2_debug(5, "journal_head %p\n", jh);
1279	err = -EROFS;
1280	if (is_handle_aborted(handle))
1281		goto out;
1282	journal = transaction->t_journal;
1283	err = 0;
1284
1285	JBUFFER_TRACE(jh, "entry");
1286	/*
1287	 * The buffer may already belong to this transaction due to pre-zeroing
1288	 * in the filesystem's new_block code.  It may also be on the previous,
1289	 * committing transaction's lists, but it HAS to be in Forget state in
1290	 * that case: the transaction must have deleted the buffer for it to be
1291	 * reused here.
1292	 */
1293	spin_lock(&jh->b_state_lock);
1294	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1295		jh->b_transaction == NULL ||
1296		(jh->b_transaction == journal->j_committing_transaction &&
1297			  jh->b_jlist == BJ_Forget)));
1298
1299	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1300	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
1301
1302	if (jh->b_transaction == NULL) {
1303		/*
1304		 * Previous jbd2_journal_forget() could have left the buffer
1305		 * with jbddirty bit set because it was being committed. When
1306		 * the commit finished, we've filed the buffer for
1307		 * checkpointing and marked it dirty. Now we are reallocating
1308		 * the buffer so the transaction freeing it must have
1309		 * committed and so it's safe to clear the dirty bit.
1310		 */
1311		clear_buffer_dirty(jh2bh(jh));
1312		/* first access by this transaction */
1313		jh->b_modified = 0;
1314
1315		JBUFFER_TRACE(jh, "file as BJ_Reserved");
1316		spin_lock(&journal->j_list_lock);
1317		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1318		spin_unlock(&journal->j_list_lock);
1319	} else if (jh->b_transaction == journal->j_committing_transaction) {
1320		/* first access by this transaction */
1321		jh->b_modified = 0;
1322
1323		JBUFFER_TRACE(jh, "set next transaction");
1324		spin_lock(&journal->j_list_lock);
1325		jh->b_next_transaction = transaction;
1326		spin_unlock(&journal->j_list_lock);
1327	}
1328	spin_unlock(&jh->b_state_lock);
1329
1330	/*
1331	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
1332	 * blocks which contain freed but then revoked metadata.  We need
1333	 * to cancel the revoke in case we end up freeing it yet again
1334	 * and then reallocating it as data - this would cause a second revoke,
1335	 * which hits an assertion error.
1336	 */
1337	JBUFFER_TRACE(jh, "cancelling revoke");
1338	jbd2_journal_cancel_revoke(handle, jh);
1339out:
1340	jbd2_journal_put_journal_head(jh);
1341	return err;
1342}
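
/*
 * Illustrative calling convention for a newly allocated metadata block
 * (a sketch along the lines of what ext4_getblk() does; @sb, @handle and
 * @blocknr are assumed):
 *
 *	bh = sb_getblk(sb, blocknr);
 *	lock_buffer(bh);
 *	err = jbd2_journal_get_create_access(handle, bh);
 *	if (!err) {
 *		memset(bh->b_data, 0, bh->b_size);	(fill the new block)
 *		set_buffer_uptodate(bh);
 *	}
 *	unlock_buffer(bh);
 *	if (!err)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 */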
1343
1344/**
1345 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
1346 *     non-rewindable consequences
1347 * @handle: transaction
1348 * @bh: buffer to undo
1349 *
1350 * Sometimes there is a need to distinguish between metadata which has
1351 * been committed to disk and that which has not.  The ext3fs code uses
1352 * this for freeing and allocating space, we have to make sure that we
1353 * do not reuse freed space until the deallocation has been committed,
1354 * since if we overwrote that space we would make the delete
1355 * un-rewindable in case of a crash.
1356 *
1357 * To deal with that, jbd2_journal_get_undo_access requests write access to a
1358 * buffer for parts of non-rewindable operations such as delete
1359 * operations on the bitmaps.  The journaling code must keep a copy of
1360 * the buffer's contents prior to the undo_access call until such time
1361 * as we know that the buffer has definitely been committed to disk.
1362 *
1363 * We never need to know which transaction the committed data is part
1364 * of, buffers touched here are guaranteed to be dirtied later and so
1365 * will be committed to a new transaction in due course, at which point
1366 * we can discard the old committed data pointer.
1367 *
1368 * Returns error number or 0 on success.
1369 */
1370int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
1371{
1372	int err;
1373	struct journal_head *jh;
1374	char *committed_data = NULL;
1375
1376	if (is_handle_aborted(handle))
1377		return -EROFS;
1378
1379	if (jbd2_write_access_granted(handle, bh, true))
1380		return 0;
1381
1382	jh = jbd2_journal_add_journal_head(bh);
1383	JBUFFER_TRACE(jh, "entry");
1384
1385	/*
1386	 * Do this first --- it can drop the journal lock, so we want to
1387	 * make sure that obtaining the committed_data is done
1388	 * atomically wrt. completion of any outstanding commits.
1389	 */
1390	err = do_get_write_access(handle, jh, 1);
1391	if (err)
1392		goto out;
1393
1394repeat:
1395	if (!jh->b_committed_data)
1396		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
1397					    GFP_NOFS|__GFP_NOFAIL);
1398
1399	spin_lock(&jh->b_state_lock);
1400	if (!jh->b_committed_data) {
1401		/* Copy out the current buffer contents into the
1402		 * preserved, committed copy. */
1403		JBUFFER_TRACE(jh, "generate b_committed data");
1404		if (!committed_data) {
1405			spin_unlock(&jh->b_state_lock);
1406			goto repeat;
1407		}
1408
1409		jh->b_committed_data = committed_data;
1410		committed_data = NULL;
1411		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
1412	}
1413	spin_unlock(&jh->b_state_lock);
1414out:
1415	jbd2_journal_put_journal_head(jh);
1416	if (unlikely(committed_data))
1417		jbd2_free(committed_data, bh->b_size);
1418	return err;
1419}
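
/*
 * Illustrative sketch (roughly the bitmap pattern described above, as the
 * ext3/ext4 block allocator uses it):
 *
 *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *	(clear the freed bits in bitmap_bh->b_data; until the transaction
 *	 commits, the allocator must also consult the saved
 *	 bh2jh(bitmap_bh)->b_committed_data copy so that blocks freed by an
 *	 uncommitted transaction are not handed out again)
 *	err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 */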
1420
1421/**
1422 * jbd2_journal_set_triggers() - Add triggers for commit writeout
1423 * @bh: buffer to trigger on
1424 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1425 *
1426 * Set any triggers on this journal_head.  This is always safe, because
1427 * triggers for a committing buffer will be saved off, and triggers for
1428 * a running transaction will match the buffer in that transaction.
1429 *
1430 * Call with NULL to clear the triggers.
1431 */
1432void jbd2_journal_set_triggers(struct buffer_head *bh,
1433			       struct jbd2_buffer_trigger_type *type)
1434{
1435	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
1436
1437	if (WARN_ON_ONCE(!jh))
1438		return;
1439	jh->b_triggers = type;
1440	jbd2_journal_put_journal_head(jh);
1441}
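
/*
 * Illustrative trigger wiring (hypothetical names; ocfs2 is the in-tree
 * user of this mechanism, recomputing block checksums from t_frozen):
 *
 *	static void my_frozen(struct jbd2_buffer_trigger_type *type,
 *			      struct buffer_head *bh, void *data, size_t size)
 *	{
 *		(recompute the checksum over @data just before the buffer
 *		 is copied out to the journal)
 *	}
 *
 *	static struct jbd2_buffer_trigger_type my_triggers = {
 *		.t_frozen = my_frozen,
 *	};
 *
 *	jbd2_journal_set_triggers(bh, &my_triggers);
 */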
1442
1443void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
1444				struct jbd2_buffer_trigger_type *triggers)
1445{
1446	struct buffer_head *bh = jh2bh(jh);
1447
1448	if (!triggers || !triggers->t_frozen)
1449		return;
1450
1451	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1452}
1453
1454void jbd2_buffer_abort_trigger(struct journal_head *jh,
1455			       struct jbd2_buffer_trigger_type *triggers)
1456{
1457	if (!triggers || !triggers->t_abort)
1458		return;
1459
1460	triggers->t_abort(triggers, jh2bh(jh));
1461}
1462
1463/**
1464 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
1465 * @handle: transaction to add buffer to.
1466 * @bh: buffer to mark
1467 *
1468 * mark dirty metadata which needs to be journaled as part of the current
1469 * transaction.
1470 *
1471 * The buffer must have previously had jbd2_journal_get_write_access()
1472 * called so that it has a valid journal_head attached to the buffer
1473 * head.
1474 *
1475 * The buffer is placed on the transaction's metadata list and is marked
1476 * as belonging to the transaction.
1477 *
1478 * Returns error number or 0 on success.
1479 *
1480 * Special care needs to be taken if the buffer already belongs to the
1481 * current committing transaction (in which case we should have frozen
1482 * data present for that commit).  In that case, we don't relink the
1483 * buffer: that only gets done when the old transaction finally
1484 * completes its commit.
1485 */
1486int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1487{
1488	transaction_t *transaction = handle->h_transaction;
1489	journal_t *journal;
1490	struct journal_head *jh;
1491	int ret = 0;
1492
1493	if (!buffer_jbd(bh))
1494		return -EUCLEAN;
1495
1496	/*
1497	 * We don't grab jh reference here since the buffer must be part
1498	 * of the running transaction.
1499	 */
1500	jh = bh2jh(bh);
1501	jbd2_debug(5, "journal_head %p\n", jh);
1502	JBUFFER_TRACE(jh, "entry");
1503
1504	/*
1505	 * This and the following assertions are unreliable since we may see jh
1506	 * in inconsistent state unless we grab bh_state lock. But this is
1507	 * crucial to catch bugs so let's do a reliable check until the
1508	 * lockless handling is fully proven.
1509	 */
1510	if (data_race(jh->b_transaction != transaction &&
1511	    jh->b_next_transaction != transaction)) {
1512		spin_lock(&jh->b_state_lock);
1513		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1514				jh->b_next_transaction == transaction);
1515		spin_unlock(&jh->b_state_lock);
1516	}
1517	if (jh->b_modified == 1) {
1518		/* If it's in our transaction it must be in BJ_Metadata list. */
1519		if (data_race(jh->b_transaction == transaction &&
1520		    jh->b_jlist != BJ_Metadata)) {
1521			spin_lock(&jh->b_state_lock);
1522			if (jh->b_transaction == transaction &&
1523			    jh->b_jlist != BJ_Metadata)
1524				pr_err("JBD2: assertion failure: h_type=%u "
1525				       "h_line_no=%u block_no=%llu jlist=%u\n",
1526				       handle->h_type, handle->h_line_no,
1527				       (unsigned long long) bh->b_blocknr,
1528				       jh->b_jlist);
1529			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
1530					jh->b_jlist == BJ_Metadata);
1531			spin_unlock(&jh->b_state_lock);
1532		}
1533		goto out;
1534	}
1535
1536	journal = transaction->t_journal;
1537	spin_lock(&jh->b_state_lock);
1538
1539	if (is_handle_aborted(handle)) {
1540		/*
1541		 * Check journal aborting with @jh->b_state_lock locked,
1542		 * since 'jh->b_transaction' could be replaced with
1543		 * 'jh->b_next_transaction' during old transaction
1544		 * committing if journal aborted, which may fail
1545		 * assertion on 'jh->b_frozen_data == NULL'.
1546		 */
1547		ret = -EROFS;
1548		goto out_unlock_bh;
1549	}
1550
1551	if (jh->b_modified == 0) {
1552		/*
1553		 * This buffer has been modified and is becoming part
1554		 * of the transaction. This needs to be done
1555		 * once per transaction -bzzz
1556		 */
1557		if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
1558			ret = -ENOSPC;
1559			goto out_unlock_bh;
1560		}
1561		jh->b_modified = 1;
1562		handle->h_total_credits--;
1563	}
1564
1565	/*
1566	 * fastpath, to avoid expensive locking.  If this buffer is already
1567	 * on the running transaction's metadata list there is nothing to do.
1568	 * Nobody can take it off again because there is a handle open.
1569	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1570	 * result in this test being false, so we go in and take the locks.
1571	 */
1572	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1573		JBUFFER_TRACE(jh, "fastpath");
1574		if (unlikely(jh->b_transaction !=
1575			     journal->j_running_transaction)) {
1576			printk(KERN_ERR "JBD2: %s: "
1577			       "jh->b_transaction (%llu, %p, %u) != "
1578			       "journal->j_running_transaction (%p, %u)\n",
1579			       journal->j_devname,
1580			       (unsigned long long) bh->b_blocknr,
1581			       jh->b_transaction,
1582			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
1583			       journal->j_running_transaction,
1584			       journal->j_running_transaction ?
1585			       journal->j_running_transaction->t_tid : 0);
1586			ret = -EINVAL;
1587		}
1588		goto out_unlock_bh;
1589	}
1590
1591	set_buffer_jbddirty(bh);
1592
1593	/*
1594	 * Metadata already on the current transaction list doesn't
1595	 * need to be filed.  Metadata on another transaction's list must
1596	 * be committing, and will be refiled once the commit completes:
1597	 * leave it alone for now.
1598	 */
1599	if (jh->b_transaction != transaction) {
1600		JBUFFER_TRACE(jh, "already on other transaction");
1601		if (unlikely(((jh->b_transaction !=
1602			       journal->j_committing_transaction)) ||
1603			     (jh->b_next_transaction != transaction))) {
1604			printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1605			       "bad jh for block %llu: "
1606			       "transaction (%p, %u), "
1607			       "jh->b_transaction (%p, %u), "
1608			       "jh->b_next_transaction (%p, %u), jlist %u\n",
1609			       journal->j_devname,
1610			       (unsigned long long) bh->b_blocknr,
1611			       transaction, transaction->t_tid,
1612			       jh->b_transaction,
1613			       jh->b_transaction ?
1614			       jh->b_transaction->t_tid : 0,
1615			       jh->b_next_transaction,
1616			       jh->b_next_transaction ?
1617			       jh->b_next_transaction->t_tid : 0,
1618			       jh->b_jlist);
1619			WARN_ON(1);
1620			ret = -EINVAL;
1621		}
1622		/* And this case is illegal: we can't reuse another
1623		 * transaction's data buffer, ever. */
1624		goto out_unlock_bh;
1625	}
1626
1627	/* That test should have eliminated the following case: */
1628	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1629
1630	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1631	spin_lock(&journal->j_list_lock);
1632	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1633	spin_unlock(&journal->j_list_lock);
1634out_unlock_bh:
1635	spin_unlock(&jh->b_state_lock);
1636out:
1637	JBUFFER_TRACE(jh, "exit");
1638	return ret;
1639}
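/*
 * Example of the canonical metadata-update sequence this function is
 * part of (a minimal sketch of an assumed ext4-style caller; error
 * handling is abbreviated and the variable names are hypothetical):
 *
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	if (err)
 *		goto out;
 *	... modify the buffer contents ...
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 */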
1640
1641/**
1642 * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1643 * @handle: transaction handle
1644 * @bh:     bh to 'forget'
1645 *
1646 * We can only do the bforget if there are no commits pending against the
1647 * buffer.  If the buffer is dirty in the current running transaction we
1648 * can safely unlink it.
1649 *
1650 * bh may not be a journalled buffer at all - it may be a non-JBD
1651 * buffer which came off the hashtable.  Check for this.
1652 *
1653 * Decrements bh->b_count by one.
1654 *
1655 * Allow this call even if the handle has aborted --- it may be part of
1656 * the caller's cleanup after an abort.
1657 */
1658int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
1659{
1660	transaction_t *transaction = handle->h_transaction;
1661	journal_t *journal;
1662	struct journal_head *jh;
1663	int drop_reserve = 0;
1664	int err = 0;
1665	int was_modified = 0;
1666
1667	if (is_handle_aborted(handle))
1668		return -EROFS;
1669	journal = transaction->t_journal;
1670
1671	BUFFER_TRACE(bh, "entry");
1672
1673	jh = jbd2_journal_grab_journal_head(bh);
1674	if (!jh) {
1675		__bforget(bh);
1676		return 0;
1677	}
1678
1679	spin_lock(&jh->b_state_lock);
1680
1681	/* Critical error: attempting to delete a bitmap buffer, maybe?
1682	 * Don't do any jbd operations, and return an error. */
1683	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1684			 "inconsistent data on disk")) {
1685		err = -EIO;
1686		goto drop;
1687	}
1688
1689	/* keep track of whether or not this transaction modified us */
1690	was_modified = jh->b_modified;
1691
1692	/*
1693	 * The buffer is going away from the transaction, so we
1694	 * must drop all references to it -bzzz
1695	 */
1696	jh->b_modified = 0;
1697
1698	if (jh->b_transaction == transaction) {
1699		J_ASSERT_JH(jh, !jh->b_frozen_data);
1700
1701		/* If we are forgetting a buffer which is already part
1702		 * of this transaction, then we can just drop it from
1703		 * the transaction immediately. */
1704		clear_buffer_dirty(bh);
1705		clear_buffer_jbddirty(bh);
1706
1707		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1708
1709		/*
1710		 * we only want to drop a reference if this transaction
1711		 * modified the buffer
1712		 */
1713		if (was_modified)
1714			drop_reserve = 1;
1715
1716		/*
1717		 * We are no longer going to journal this buffer.
1718		 * However, the commit of this transaction is still
1719		 * important to the buffer: the delete that we are now
1720		 * processing might obsolete an old log entry, so by
1721		 * committing, we can satisfy the buffer's checkpoint.
1722		 *
1723		 * So, if we have a checkpoint on the buffer, we should
1724		 * now refile the buffer on our BJ_Forget list so that
1725		 * we know to remove the checkpoint after we commit.
1726		 */
1727
1728		spin_lock(&journal->j_list_lock);
1729		if (jh->b_cp_transaction) {
1730			__jbd2_journal_temp_unlink_buffer(jh);
1731			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1732		} else {
1733			__jbd2_journal_unfile_buffer(jh);
1734			jbd2_journal_put_journal_head(jh);
1735		}
1736		spin_unlock(&journal->j_list_lock);
1737	} else if (jh->b_transaction) {
1738		J_ASSERT_JH(jh, (jh->b_transaction ==
1739				 journal->j_committing_transaction));
1740		/* However, if the buffer is still owned by a prior
1741		 * (committing) transaction, we can't drop it yet... */
1742		JBUFFER_TRACE(jh, "belongs to older transaction");
1743	/* ... but we CAN drop it from the new transaction by
1744	 * marking the buffer as freed and setting b_next_transaction
1745	 * to the new transaction, so that not only does the commit
1746	 * code know it should clear dirty bits when it is done with
1747	 * the buffer, but also the buffer can be checkpointed only
1748	 * after the new transaction commits. */
1749
1750		set_buffer_freed(bh);
1751
1752		if (!jh->b_next_transaction) {
1753			spin_lock(&journal->j_list_lock);
1754			jh->b_next_transaction = transaction;
1755			spin_unlock(&journal->j_list_lock);
1756		} else {
1757			J_ASSERT(jh->b_next_transaction == transaction);
1758
1759			/*
1760			 * only drop a reference if this transaction modified
1761			 * the buffer
1762			 */
1763			if (was_modified)
1764				drop_reserve = 1;
1765		}
1766	} else {
1767		/*
1768	 * Finally, if the buffer does not belong to any
1769	 * transaction, we can just drop it now if it has no
1770		 * checkpoint.
1771		 */
1772		spin_lock(&journal->j_list_lock);
1773		if (!jh->b_cp_transaction) {
1774			JBUFFER_TRACE(jh, "belongs to no transaction");
1775			spin_unlock(&journal->j_list_lock);
1776			goto drop;
1777		}
1778
1779		/*
1780		 * Otherwise, if the buffer has been written to disk,
1781		 * it is safe to remove the checkpoint and drop it.
1782		 */
1783		if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
1784			spin_unlock(&journal->j_list_lock);
1785			goto drop;
1786		}
1787
1788		/*
1789	 * The buffer has not been written to disk yet, so we
1790	 * should attach it to the current transaction so that it
1791	 * can be checkpointed only after the current
1792	 * transaction commits.
1793		 */
1794		clear_buffer_dirty(bh);
1795		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1796		spin_unlock(&journal->j_list_lock);
1797	}
1798drop:
1799	__brelse(bh);
1800	spin_unlock(&jh->b_state_lock);
1801	jbd2_journal_put_journal_head(jh);
1802	if (drop_reserve) {
1803		/* no need to reserve log space for this block -bzzz */
1804		handle->h_total_credits++;
1805	}
1806	return err;
1807}
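/*
 * Example of a delete path (a minimal sketch in the spirit of an
 * assumed ext4_forget()-style caller; "sb" and "blocknr" are
 * hypothetical):
 *
 *	bh = sb_find_get_block(sb, blocknr);
 *	if (bh)
 *		err = jbd2_journal_forget(handle, bh);
 *
 * Note that jbd2_journal_forget() consumes the bh reference, so the
 * caller must not brelse() it afterwards.
 */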
1808
1809/**
1810 * jbd2_journal_stop() - complete a transaction
1811 * @handle: transaction to complete.
1812 *
1813 * All done for a particular handle.
1814 *
1815 * There is not much action needed here.  We just return any remaining
1816 * buffer credits to the transaction and remove the handle.  The only
1817 * complication is that we need to start a commit operation if the
1818 * filesystem is marked for synchronous update.
1819 *
1820 * jbd2_journal_stop itself will not usually return an error, but it may
1821 * do so in unusual circumstances.  In particular, expect it to
1822 * return -EIO if a jbd2_journal_abort has been executed since the
1823 * transaction began.
1824 */
1825int jbd2_journal_stop(handle_t *handle)
1826{
1827	transaction_t *transaction = handle->h_transaction;
1828	journal_t *journal;
1829	int err = 0, wait_for_commit = 0;
1830	tid_t tid;
1831	pid_t pid;
1832
1833	if (--handle->h_ref > 0) {
1834		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1835						 handle->h_ref);
1836		if (is_handle_aborted(handle))
1837			return -EIO;
1838		return 0;
1839	}
1840	if (!transaction) {
1841		/*
1842		 * Handle is already detached from the transaction so there is
1843		 * nothing to do other than free the handle.
1844		 */
1845		memalloc_nofs_restore(handle->saved_alloc_context);
1846		goto free_and_exit;
1847	}
1848	journal = transaction->t_journal;
1849	tid = transaction->t_tid;
1850
1851	if (is_handle_aborted(handle))
1852		err = -EIO;
1853
1854	jbd2_debug(4, "Handle %p going down\n", handle);
1855	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1856				tid, handle->h_type, handle->h_line_no,
1857				jiffies - handle->h_start_jiffies,
1858				handle->h_sync, handle->h_requested_credits,
1859				(handle->h_requested_credits -
1860				 handle->h_total_credits));
1861
1862	/*
1863	 * Implement synchronous transaction batching.  If the handle
1864	 * was synchronous, don't force a commit immediately.  Let's
1865	 * yield and let another thread piggyback onto this
1866	 * transaction.  Keep doing that while new threads continue to
1867	 * arrive.  It doesn't cost much - we're about to run a commit
1868	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
1869	 * operations by 30x or more...
1870	 *
1871	 * We try to optimize the sleep time against what the
1872	 * underlying disk can do, instead of having a static sleep
1873	 * time.  This is useful for the case where our storage is so
1874	 * fast that it is better to go ahead and force a flush
1875	 * and wait for the transaction to be committed than it is to
1876	 * wait for an arbitrary amount of time for new writers to
1877	 * join the transaction.  We achieve this by measuring how
1878	 * long it takes to commit a transaction, and compare it with
1879	 * how long this transaction has been running, and if run time
1880	 * < commit time then we sleep for the delta and commit.  This
1881	 * greatly helps super fast disks that would see slowdowns as
1882	 * more threads started doing fsyncs.
1883	 *
1884	 * But don't do this if this process was the most recent one
1885	 * to perform a synchronous write.  We do this to detect the
1886	 * case where a single process is doing a stream of sync
1887	 * writes.  No point in waiting for joiners in that case.
1888	 *
1889	 * Setting max_batch_time to 0 disables this completely.
1890	 */
1891	pid = current->pid;
1892	if (handle->h_sync && journal->j_last_sync_writer != pid &&
1893	    journal->j_max_batch_time) {
1894		u64 commit_time, trans_time;
1895
1896		journal->j_last_sync_writer = pid;
1897
1898		read_lock(&journal->j_state_lock);
1899		commit_time = journal->j_average_commit_time;
1900		read_unlock(&journal->j_state_lock);
1901
1902		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1903						   transaction->t_start_time));
1904
1905		commit_time = max_t(u64, commit_time,
1906				    1000*journal->j_min_batch_time);
1907		commit_time = min_t(u64, commit_time,
1908				    1000*journal->j_max_batch_time);
1909
1910		if (trans_time < commit_time) {
1911			ktime_t expires = ktime_add_ns(ktime_get(),
1912						       commit_time);
1913			set_current_state(TASK_UNINTERRUPTIBLE);
1914			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1915		}
1916	}
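	/*
	 * Worked example (illustrative): with an average commit time of
	 * 2ms, and min/max batch times that do not clamp it, a sync
	 * handle whose transaction has been running for only 0.5ms
	 * sleeps roughly 2ms here, giving other threads a window to
	 * join the transaction before it is committed.
	 */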
1917
1918	if (handle->h_sync)
1919		transaction->t_synchronous_commit = 1;
1920
1921	/*
1922	 * If the handle is marked SYNC, we need to set another commit
1923	 * going!  We also want to force a commit if the transaction is too
1924	 * old now.
1925	 */
1926	if (handle->h_sync ||
1927	    time_after_eq(jiffies, transaction->t_expires)) {
1928		/* Do this even for aborted journals: an abort still
1929		 * completes the commit thread, it just doesn't write
1930		 * anything to disk. */
1931
1932		jbd2_debug(2, "transaction too old, requesting commit for "
1933					"handle %p\n", handle);
1934		/* This is non-blocking */
1935		jbd2_log_start_commit(journal, tid);
1936
1937		/*
1938		 * Special case: JBD2_SYNC synchronous updates require us
1939		 * to wait for the commit to complete.
1940		 */
1941		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1942			wait_for_commit = 1;
1943	}
1944
1945	/*
1946	 * Once stop_this_handle() drops t_updates, the transaction could start
1947	 * committing on us and eventually disappear.  So we must not
1948	 * dereference transaction pointer again after calling
1949	 * stop_this_handle().
1950	 */
1951	stop_this_handle(handle);
1952
1953	if (wait_for_commit)
1954		err = jbd2_log_wait_commit(journal, tid);
1955
1956free_and_exit:
1957	if (handle->h_rsv_handle)
1958		jbd2_free_handle(handle->h_rsv_handle);
1959	jbd2_free_handle(handle);
1960	return err;
1961}
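/*
 * Example of the full handle lifecycle around jbd2_journal_stop() (a
 * minimal sketch of an assumed caller; "credits" is hypothetical):
 *
 *	handle = jbd2_journal_start(journal, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	... journaled updates ...
 *	err = jbd2_journal_stop(handle);
 */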
1962
1963/*
1964 *
1965 * List management code snippets: various functions for manipulating the
1966 * transaction buffer lists.
1967 *
1968 */
1969
1970/*
1971 * Append a buffer to a transaction list, given the transaction's list head
1972 * pointer.
1973 *
1974 * j_list_lock is held.
1975 *
1976 * jh->b_state_lock is held.
1977 */
1978
1979static inline void
1980__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1981{
1982	if (!*list) {
1983		jh->b_tnext = jh->b_tprev = jh;
1984		*list = jh;
1985	} else {
1986		/* Insert at the tail of the list to preserve order */
1987		struct journal_head *first = *list, *last = first->b_tprev;
1988		jh->b_tprev = last;
1989		jh->b_tnext = first;
1990		last->b_tnext = first->b_tprev = jh;
1991	}
1992}
1993
1994/*
1995 * Remove a buffer from a transaction list, given the transaction's list
1996 * head pointer.
1997 *
1998 * Called with j_list_lock held, and the journal may not be locked.
1999 *
2000 * jh->b_state_lock is held.
2001 */
2002
2003static inline void
2004__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
2005{
2006	if (*list == jh) {
2007		*list = jh->b_tnext;
2008		if (*list == jh)
2009			*list = NULL;
2010	}
2011	jh->b_tprev->b_tnext = jh->b_tnext;
2012	jh->b_tnext->b_tprev = jh->b_tprev;
2013}
2014
2015/*
2016 * Remove a buffer from the appropriate transaction list.
2017 *
2018 * Note that this function can *change* the value of
2019 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
2020 * t_reserved_list.  If the caller is holding onto a copy of one of these
2021 * pointers, it could go bad.  Generally the caller needs to re-read the
2022 * pointer from the transaction_t.
2023 *
2024 * Called under j_list_lock.
2025 */
2026static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
2027{
2028	struct journal_head **list = NULL;
2029	transaction_t *transaction;
2030	struct buffer_head *bh = jh2bh(jh);
2031
2032	lockdep_assert_held(&jh->b_state_lock);
2033	transaction = jh->b_transaction;
2034	if (transaction)
2035		assert_spin_locked(&transaction->t_journal->j_list_lock);
2036
2037	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2038	if (jh->b_jlist != BJ_None)
2039		J_ASSERT_JH(jh, transaction != NULL);
2040
2041	switch (jh->b_jlist) {
2042	case BJ_None:
2043		return;
2044	case BJ_Metadata:
2045		transaction->t_nr_buffers--;
2046		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
2047		list = &transaction->t_buffers;
2048		break;
2049	case BJ_Forget:
2050		list = &transaction->t_forget;
2051		break;
2052	case BJ_Shadow:
2053		list = &transaction->t_shadow_list;
2054		break;
2055	case BJ_Reserved:
2056		list = &transaction->t_reserved_list;
2057		break;
2058	}
2059
2060	__blist_del_buffer(list, jh);
2061	jh->b_jlist = BJ_None;
2062	if (transaction && is_journal_aborted(transaction->t_journal))
2063		clear_buffer_jbddirty(bh);
2064	else if (test_clear_buffer_jbddirty(bh))
2065		mark_buffer_dirty(bh);	/* Expose it to the VM */
2066}
2067
2068/*
2069 * Remove buffer from all transactions. The caller is responsible for dropping
2070 * the jh reference that belonged to the transaction.
2071 *
2072 * Called with bh_state lock and j_list_lock
2073 * Called with jh->b_state_lock and j_list_lock held.
2074static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
2075{
2076	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2077	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2078
2079	__jbd2_journal_temp_unlink_buffer(jh);
2080	jh->b_transaction = NULL;
2081}
2082
2083void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
2084{
2085	struct buffer_head *bh = jh2bh(jh);
2086
2087	/* Get reference so that buffer cannot be freed before we unlock it */
2088	get_bh(bh);
2089	spin_lock(&jh->b_state_lock);
2090	spin_lock(&journal->j_list_lock);
2091	__jbd2_journal_unfile_buffer(jh);
2092	spin_unlock(&journal->j_list_lock);
2093	spin_unlock(&jh->b_state_lock);
2094	jbd2_journal_put_journal_head(jh);
2095	__brelse(bh);
2096}
2097
2098/**
2099 * jbd2_journal_try_to_free_buffers() - try to free page buffers.
2100 * @journal: journal for operation
2101 * @folio: Folio to detach data from.
2102 *
2103 * For all the buffers on this page,
2104 * if they are fully written out ordered data, move them onto BUF_CLEAN
2105 * so try_to_free_buffers() can reap them.
2106 *
2107 * This function returns true if we wish try_to_free_buffers()
2108 * to be called. We do this if the page is releasable by try_to_free_buffers().
2109 * We also do it if the page has locked or dirty buffers and the caller wants
2110 * us to perform sync or async writeout.
2111 *
2112 * This complicates JBD locking somewhat.  We aren't protected by the
2113 * BKL here.  We wish to remove the buffer from its committing or
2114 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
2115 *
2116 * This may *change* the value of transaction_t->t_datalist, so anyone
2117 * who looks at t_datalist needs to lock against this function.
2118 *
2119 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
2120 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
2121 * will come out of the lock with the buffer dirty, which makes it
2122 * ineligible for release here.
2123 *
2124 * Who else is affected by this?  hmm...  Really the only contender
2125 * is do_get_write_access() - it could be looking at the buffer while
2126 * journal_try_to_free_buffer() is changing its state.  But that
2127 * cannot happen because we never reallocate freed data as metadata
2128 * while the data is part of a transaction.  Yes?
2129 *
2130 * Return false on failure, true on success
2131 */
2132bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
2133{
2134	struct buffer_head *head;
2135	struct buffer_head *bh;
2136	bool ret = false;
2137
2138	J_ASSERT(folio_test_locked(folio));
2139
2140	head = folio_buffers(folio);
2141	bh = head;
2142	do {
2143		struct journal_head *jh;
2144
2145		/*
2146		 * We take our own ref against the journal_head here to avoid
2147		 * having to add tons of locking around each instance of
2148		 * jbd2_journal_put_journal_head().
2149		 */
2150		jh = jbd2_journal_grab_journal_head(bh);
2151		if (!jh)
2152			continue;
2153
2154		spin_lock(&jh->b_state_lock);
2155		if (!jh->b_transaction && !jh->b_next_transaction) {
2156			spin_lock(&journal->j_list_lock);
2157			/* Remove written-back checkpointed metadata buffer */
2158			if (jh->b_cp_transaction != NULL)
2159				jbd2_journal_try_remove_checkpoint(jh);
2160			spin_unlock(&journal->j_list_lock);
2161		}
2162		spin_unlock(&jh->b_state_lock);
2163		jbd2_journal_put_journal_head(jh);
2164		if (buffer_jbd(bh))
2165			goto busy;
2166	} while ((bh = bh->b_this_page) != head);
2167
2168	ret = try_to_free_buffers(folio);
2169busy:
2170	return ret;
2171}
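/*
 * Example (a minimal sketch of an assumed ->release_folio
 * implementation built on this helper):
 *
 *	if (folio_test_dirty(folio))
 *		return false;
 *	return jbd2_journal_try_to_free_buffers(journal, folio);
 */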
2172
2173/*
2174 * This buffer is no longer needed.  If it is on an older transaction's
2175 * checkpoint list we need to record it on this transaction's forget list
2176 * to pin this buffer (and hence its checkpointing transaction) down until
2177 * this transaction commits.  If the buffer isn't on a checkpoint list, we
2178 * release it.
2179 * Returns non-zero if JBD no longer has an interest in the buffer.
2180 *
2181 * Called under j_list_lock.
2182 *
2183 * Called under jh->b_state_lock.
2184 */
2185static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
2186{
2187	int may_free = 1;
2188	struct buffer_head *bh = jh2bh(jh);
2189
2190	if (jh->b_cp_transaction) {
2191		JBUFFER_TRACE(jh, "on running+cp transaction");
2192		__jbd2_journal_temp_unlink_buffer(jh);
2193		/*
2194		 * We don't want to write the buffer anymore, so clear
2195		 * the dirty bit so that we don't confuse checks in
2196		 * __jbd2_journal_file_buffer().
2197		 */
2198		clear_buffer_dirty(bh);
2199		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
2200		may_free = 0;
2201	} else {
2202		JBUFFER_TRACE(jh, "on running transaction");
2203		__jbd2_journal_unfile_buffer(jh);
2204		jbd2_journal_put_journal_head(jh);
2205	}
2206	return may_free;
2207}
2208
2209/*
2210 * jbd2_journal_invalidate_folio
2211 *
2212 * This code is tricky.  It has a number of cases to deal with.
2213 *
2214 * There are two invariants which this code relies on:
2215 *
2216 * i_size must be updated on disk before we start calling invalidate_folio
2217 * on the data.
2218 *
2219 *  This is done in ext3 by defining an ext3_setattr method which
2220 *  updates i_size before truncate gets going.  By maintaining this
2221 *  invariant, we can be sure that it is safe to throw away any buffers
2222 *  attached to the current transaction: once the transaction commits,
2223 *  we know that the data will not be needed.
2224 *
2225 *  Note however that we can *not* throw away data belonging to the
2226 *  previous, committing transaction!
2227 *
2228 * Any disk blocks which *are* part of the previous, committing
2229 * transaction (and which therefore cannot be discarded immediately) are
2230 * not going to be reused in the new running transaction
2231 * not going to be reused in the new running transaction.
2232 *  The bitmap committed_data images guarantee this: any block which is
2233 *  allocated in one transaction and removed in the next will be marked
2234 *  as in-use in the committed_data bitmap, so cannot be reused until
2235 *  the next transaction to delete the block commits.  This means that
2236 *  leaving committing buffers dirty is quite safe: the disk blocks
2237 *  cannot be reallocated to a different file and so buffer aliasing is
2238 *  not possible.
2239 *
2240 *
2241 * The above applies mainly to ordered data mode.  In writeback mode we
2242 * don't make guarantees about the order in which data hits disk --- in
2243 * particular we don't guarantee that new dirty data is flushed before
2244 * transaction commit --- so it is always safe just to discard data
2245 * immediately in that mode.  --sct
2246 */
2247
2248/*
2249 * The journal_unmap_buffer helper function returns zero if the buffer
2250 * concerned remains pinned as an anonymous buffer belonging to an older
2251 * transaction.
2252 *
2253 * We're outside-transaction here.  Either or both of j_running_transaction
2254 * and j_committing_transaction may be NULL.
2255 */
2256static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
2257				int partial_page)
2258{
2259	transaction_t *transaction;
2260	struct journal_head *jh;
2261	int may_free = 1;
2262
2263	BUFFER_TRACE(bh, "entry");
2264
2265	/*
2266	 * It is safe to proceed here without the j_list_lock because the
2267	 * buffers cannot be stolen by try_to_free_buffers as long as we are
2268	 * holding the page lock. --sct
2269	 */
2270
2271	jh = jbd2_journal_grab_journal_head(bh);
2272	if (!jh)
2273		goto zap_buffer_unlocked;
2274
2275	/* OK, we have data buffer in journaled mode */
2276	write_lock(&journal->j_state_lock);
2277	spin_lock(&jh->b_state_lock);
2278	spin_lock(&journal->j_list_lock);
2279
2280	/*
2281	 * We cannot remove the buffer from checkpoint lists until the
2282	 * transaction adding inode to orphan list (let's call it T)
2283	 * is committed.  Otherwise, if the transaction changing the
2284	 * buffer were cleaned from the journal before T is
2285	 * committed, a crash would cause the correct contents of
2286	 * the buffer to be lost.  On the other hand we have to
2287	 * clear the buffer dirty bit no later than the moment when the
2288	 * transaction marking the buffer as freed in the filesystem
2289	 * structures is committed because from that moment on the
2290	 * block can be reallocated and used by a different page.
2291	 * Since the block hasn't been freed yet but the inode has
2292	 * already been added to orphan list, it is safe for us to add
2293	 * the buffer to BJ_Forget list of the newest transaction.
2294	 *
2295	 * Also we have to clear buffer_mapped flag of a truncated buffer
2296	 * because the buffer_head may be attached to the page straddling
2297	 * i_size (can happen only when blocksize < pagesize) and thus the
2298	 * buffer_head can be reused when the file is extended again. So we end
2299	 * up keeping around invalidated buffers attached to transactions'
2300	 * BJ_Forget list just to stop checkpointing code from cleaning up
2301	 * the transaction this buffer was modified in.
2302	 */
2303	transaction = jh->b_transaction;
2304	if (transaction == NULL) {
2305		/* First case: not on any transaction.  If it
2306		 * has no checkpoint link, then we can zap it:
2307		 * it's a writeback-mode buffer so we don't care
2308		 * if it hits disk safely. */
2309		if (!jh->b_cp_transaction) {
2310			JBUFFER_TRACE(jh, "not on any transaction: zap");
2311			goto zap_buffer;
2312		}
2313
2314		if (!buffer_dirty(bh)) {
2315			/* bdflush has written it.  We can drop it now */
2316			__jbd2_journal_remove_checkpoint(jh);
2317			goto zap_buffer;
2318		}
2319
2320		/* OK, it must be in the journal but still not
2321		 * written fully to disk: it's metadata or
2322		 * journaled data... */
2323
2324		if (journal->j_running_transaction) {
2325			/* ... and once the current transaction has
2326			 * committed, the buffer won't be needed any
2327			 * longer. */
2328			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
2329			may_free = __dispose_buffer(jh,
2330					journal->j_running_transaction);
2331			goto zap_buffer;
2332		} else {
2333			/* There is no currently-running transaction. So the
2334			 * orphan record which we wrote for this file must have
2335			 * passed into commit.  We must attach this buffer to
2336			 * the committing transaction, if it exists. */
2337			if (journal->j_committing_transaction) {
2338				JBUFFER_TRACE(jh, "give to committing trans");
2339				may_free = __dispose_buffer(jh,
2340					journal->j_committing_transaction);
2341				goto zap_buffer;
2342			} else {
2343				/* The orphan record's transaction has
2344				 * committed.  We can cleanse this buffer */
2345				clear_buffer_jbddirty(bh);
2346				__jbd2_journal_remove_checkpoint(jh);
2347				goto zap_buffer;
2348			}
2349		}
2350	} else if (transaction == journal->j_committing_transaction) {
2351		JBUFFER_TRACE(jh, "on committing transaction");
2352		/*
2353		 * The buffer is committing, we simply cannot touch
2354		 * it. If the page is straddling i_size we have to wait
2355		 * for commit and try again.
2356		 */
2357		if (partial_page) {
2358			spin_unlock(&journal->j_list_lock);
2359			spin_unlock(&jh->b_state_lock);
2360			write_unlock(&journal->j_state_lock);
2361			jbd2_journal_put_journal_head(jh);
2362			/* Already zapped buffer? Nothing to do... */
2363			if (!bh->b_bdev)
2364				return 0;
2365			return -EBUSY;
2366		}
2367		/*
2368		 * OK, buffer won't be reachable after truncate. We just clear
2369		 * b_modified to not confuse transaction credit accounting, and
2370		 * set j_next_transaction to the running transaction (if there
2371		 * is one) and mark buffer as freed so that commit code knows
2372		 * it should clear dirty bits when it is done with the buffer.
2373		 */
2374		set_buffer_freed(bh);
2375		if (journal->j_running_transaction && buffer_jbddirty(bh))
2376			jh->b_next_transaction = journal->j_running_transaction;
2377		jh->b_modified = 0;
2378		spin_unlock(&journal->j_list_lock);
2379		spin_unlock(&jh->b_state_lock);
2380		write_unlock(&journal->j_state_lock);
2381		jbd2_journal_put_journal_head(jh);
2382		return 0;
2383	} else {
2384		/* Good, the buffer belongs to the running transaction.
2385		 * We are writing our own transaction's data, not any
2386		 * previous one's, so it is safe to throw it away
2387		 * (remember that we expect the filesystem to have set
2388		 * i_size already for this truncate so recovery will not
2389		 * expose the disk blocks we are discarding here.) */
2390		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
2391		JBUFFER_TRACE(jh, "on running transaction");
2392		may_free = __dispose_buffer(jh, transaction);
2393	}
2394
2395zap_buffer:
2396	/*
2397	 * This is tricky. Although the buffer is truncated, it may be reused
2398	 * if blocksize < pagesize and it is attached to the page straddling
2399	 * EOF. Since the buffer might have been added to BJ_Forget list of the
2400	 * running transaction, jbd2_journal_get_write_access() won't clear
2401	 * b_modified and credit accounting gets confused. So clear b_modified
2402	 * here.
2403	 */
2404	jh->b_modified = 0;
2405	spin_unlock(&journal->j_list_lock);
2406	spin_unlock(&jh->b_state_lock);
2407	write_unlock(&journal->j_state_lock);
2408	jbd2_journal_put_journal_head(jh);
2409zap_buffer_unlocked:
2410	clear_buffer_dirty(bh);
2411	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2412	clear_buffer_mapped(bh);
2413	clear_buffer_req(bh);
2414	clear_buffer_new(bh);
2415	clear_buffer_delay(bh);
2416	clear_buffer_unwritten(bh);
2417	bh->b_bdev = NULL;
2418	return may_free;
2419}
2420
2421/**
2422 * jbd2_journal_invalidate_folio()
2423 * @journal: journal to use for flush...
2424 * @folio:    folio to flush
2425 * @offset:  start of the range to invalidate
2426 * @length:  length of the range to invalidate
2427 *
2428 * Reap page buffers containing data in the specified range of the page.
2429 * Can return -EBUSY if buffers are part of the committing transaction and
2430 * the page is straddling i_size. Caller then has to wait for current commit
2431 * and try again.
2432 */
2433int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
2434				size_t offset, size_t length)
2435{
2436	struct buffer_head *head, *bh, *next;
2437	unsigned int stop = offset + length;
2438	unsigned int curr_off = 0;
2439	int partial_page = (offset || length < folio_size(folio));
2440	int may_free = 1;
2441	int ret = 0;
2442
2443	if (!folio_test_locked(folio))
2444		BUG();
2445	head = folio_buffers(folio);
2446	if (!head)
2447		return 0;
2448
2449	BUG_ON(stop > folio_size(folio) || stop < length);
2450
2451	/* We will potentially be playing with lists other than just the
2452	 * data lists (especially for journaled data mode), so be
2453	 * cautious in our locking. */
2454
2455	bh = head;
2456	do {
2457		unsigned int next_off = curr_off + bh->b_size;
2458		next = bh->b_this_page;
2459
2460		if (next_off > stop)
2461			return 0;
2462
2463		if (offset <= curr_off) {
2464			/* This block is wholly outside the truncation point */
2465			lock_buffer(bh);
2466			ret = journal_unmap_buffer(journal, bh, partial_page);
2467			unlock_buffer(bh);
2468			if (ret < 0)
2469				return ret;
2470			may_free &= ret;
2471		}
2472		curr_off = next_off;
2473		bh = next;
2474
2475	} while (bh != head);
2476
2477	if (!partial_page) {
2478		if (may_free && try_to_free_buffers(folio))
2479			J_ASSERT(!folio_buffers(folio));
2480	}
2481	return 0;
2482}
2483
2484/*
2485 * File a buffer on the given transaction list.
2486 */
2487void __jbd2_journal_file_buffer(struct journal_head *jh,
2488			transaction_t *transaction, int jlist)
2489{
2490	struct journal_head **list = NULL;
2491	int was_dirty = 0;
2492	struct buffer_head *bh = jh2bh(jh);
2493
2494	lockdep_assert_held(&jh->b_state_lock);
2495	assert_spin_locked(&transaction->t_journal->j_list_lock);
2496
2497	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2498	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2499				jh->b_transaction == NULL);
2500
2501	if (jh->b_transaction && jh->b_jlist == jlist)
2502		return;
2503
2504	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2505	    jlist == BJ_Shadow || jlist == BJ_Forget) {
2506		/*
2507		 * For metadata buffers, we track dirty bit in buffer_jbddirty
2508		 * instead of buffer_dirty. We should not see a dirty bit set
2509		 * here because we clear it in do_get_write_access(), but e.g.
2510		 * tune2fs can modify the sb and set the dirty bit at any time,
2511		 * so we try to handle that gracefully.
2512		 */
2513		if (buffer_dirty(bh))
2514			warn_dirty_buffer(bh);
2515		if (test_clear_buffer_dirty(bh) ||
2516		    test_clear_buffer_jbddirty(bh))
2517			was_dirty = 1;
2518	}
2519
2520	if (jh->b_transaction)
2521		__jbd2_journal_temp_unlink_buffer(jh);
2522	else
2523		jbd2_journal_grab_journal_head(bh);
2524	jh->b_transaction = transaction;
2525
2526	switch (jlist) {
2527	case BJ_None:
2528		J_ASSERT_JH(jh, !jh->b_committed_data);
2529		J_ASSERT_JH(jh, !jh->b_frozen_data);
2530		return;
2531	case BJ_Metadata:
2532		transaction->t_nr_buffers++;
2533		list = &transaction->t_buffers;
2534		break;
2535	case BJ_Forget:
2536		list = &transaction->t_forget;
2537		break;
2538	case BJ_Shadow:
2539		list = &transaction->t_shadow_list;
2540		break;
2541	case BJ_Reserved:
2542		list = &transaction->t_reserved_list;
2543		break;
2544	}
2545
2546	__blist_add_buffer(list, jh);
2547	jh->b_jlist = jlist;
2548
2549	if (was_dirty)
2550		set_buffer_jbddirty(bh);
2551}
2552
2553void jbd2_journal_file_buffer(struct journal_head *jh,
2554				transaction_t *transaction, int jlist)
2555{
2556	spin_lock(&jh->b_state_lock);
2557	spin_lock(&transaction->t_journal->j_list_lock);
2558	__jbd2_journal_file_buffer(jh, transaction, jlist);
2559	spin_unlock(&transaction->t_journal->j_list_lock);
2560	spin_unlock(&jh->b_state_lock);
2561}
2562
2563/*
2564 * Remove a buffer from its current buffer list in preparation for
2565 * dropping it from its current transaction entirely.  If the buffer has
2566 * already started to be used by a subsequent transaction, refile the
2567 * buffer on that transaction's metadata list.
2568 *
2569 * Called under j_list_lock
2570 * Called under jh->b_state_lock
2571 *
2572 * When this function returns true, there's no next transaction to refile to
2573 * and the caller has to drop the jh reference through
2574 * jbd2_journal_put_journal_head().
2575 */
2576bool __jbd2_journal_refile_buffer(struct journal_head *jh)
2577{
2578	int was_dirty, jlist;
2579	struct buffer_head *bh = jh2bh(jh);
2580
2581	lockdep_assert_held(&jh->b_state_lock);
2582	if (jh->b_transaction)
2583		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2584
2585	/* If the buffer is now unused, just drop it. */
2586	if (jh->b_next_transaction == NULL) {
2587		__jbd2_journal_unfile_buffer(jh);
2588		return true;
2589	}
2590
2591	/*
2592	 * It has been modified by a later transaction: add it to the new
2593	 * transaction's metadata list.
2594	 */
2595
2596	was_dirty = test_clear_buffer_jbddirty(bh);
2597	__jbd2_journal_temp_unlink_buffer(jh);
2598
2599	/*
2600	 * b_transaction must be set, otherwise the new b_transaction won't
2601	 * be holding the jh reference.
2602	 */
2603	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2604
2605	/*
2606	 * We set b_transaction here because b_next_transaction will inherit
2607	 * our jh reference and thus __jbd2_journal_file_buffer() must not
2608	 * take a new one.
2609	 */
2610	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
2611	WRITE_ONCE(jh->b_next_transaction, NULL);
2612	if (buffer_freed(bh))
2613		jlist = BJ_Forget;
2614	else if (jh->b_modified)
2615		jlist = BJ_Metadata;
2616	else
2617		jlist = BJ_Reserved;
2618	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
2619	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2620
2621	if (was_dirty)
2622		set_buffer_jbddirty(bh);
2623	return false;
2624}
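/*
 * For example (illustrative): a buffer whose b_transaction is the
 * committing transaction and whose b_next_transaction is the running
 * one is refiled onto the running transaction's BJ_Metadata list
 * (assuming b_modified is set and the buffer is not freed), and
 * b_next_transaction is cleared.
 */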
2625
2626/*
2627 * __jbd2_journal_refile_buffer() with necessary locking added. We take our
2628 * bh reference so that we can safely unlock bh.
2629 *
2630 * The jh and bh may be freed by this call.
2631 */
2632void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2633{
2634	bool drop;
2635
2636	spin_lock(&jh->b_state_lock);
2637	spin_lock(&journal->j_list_lock);
2638	drop = __jbd2_journal_refile_buffer(jh);
2639	spin_unlock(&jh->b_state_lock);
2640	spin_unlock(&journal->j_list_lock);
2641	if (drop)
2642		jbd2_journal_put_journal_head(jh);
2643}
2644
2645/*
2646 * File inode in the inode list of the handle's transaction
2647 */
2648static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
2649		unsigned long flags, loff_t start_byte, loff_t end_byte)
2650{
2651	transaction_t *transaction = handle->h_transaction;
2652	journal_t *journal;
2653
2654	if (is_handle_aborted(handle))
2655		return -EROFS;
2656	journal = transaction->t_journal;
2657
2658	jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2659			transaction->t_tid);
2660
2661	spin_lock(&journal->j_list_lock);
2662	jinode->i_flags |= flags;
2663
2664	if (jinode->i_dirty_end) {
2665		jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
2666		jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
2667	} else {
2668		jinode->i_dirty_start = start_byte;
2669		jinode->i_dirty_end = end_byte;
2670	}
2671
2672	/* Is inode already attached where we need it? */
2673	if (jinode->i_transaction == transaction ||
2674	    jinode->i_next_transaction == transaction)
2675		goto done;
2676
2677	/*
2678	 * We only ever set this variable to 1 so the test is safe. Since
2679	 * t_need_data_flush is likely to be set, we do the test to save some
2680	 * cacheline bouncing
2681	 */
2682	if (!transaction->t_need_data_flush)
2683		transaction->t_need_data_flush = 1;
2684	/* On some different transaction's list - should be
2685	 * the committing one */
2686	if (jinode->i_transaction) {
2687		J_ASSERT(jinode->i_next_transaction == NULL);
2688		J_ASSERT(jinode->i_transaction ==
2689					journal->j_committing_transaction);
2690		jinode->i_next_transaction = transaction;
2691		goto done;
2692	}
2693	/* Not on any transaction list... */
2694	J_ASSERT(!jinode->i_next_transaction);
2695	jinode->i_transaction = transaction;
2696	list_add(&jinode->i_list, &transaction->t_inode_list);
2697done:
2698	spin_unlock(&journal->j_list_lock);
2699
2700	return 0;
2701}
2702
2703int jbd2_journal_inode_ranged_write(handle_t *handle,
2704		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2705{
2706	return jbd2_journal_file_inode(handle, jinode,
2707			JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
2708			start_byte + length - 1);
2709}
2710
2711int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
2712		loff_t start_byte, loff_t length)
2713{
2714	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
2715			start_byte, start_byte + length - 1);
2716}
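/*
 * Example (a minimal sketch of an assumed ordered-mode write path;
 * "pos" and "len" describe the byte range just allocated for a write):
 *
 *	err = jbd2_journal_inode_ranged_write(handle, jinode, pos, len);
 *
 * Commit will then write out those data blocks before the metadata
 * referencing them is committed.
 */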
2717
2718/*
2719 * File truncate and transaction commit interact with each other in a
2720 * non-trivial way.  If a transaction writing data block A is
2721 * committing, we cannot discard the data via truncate until it has
2722 * been written out.  Otherwise, if we crashed after the write
2723 * transaction committed but before the truncate transaction
2724 * committed, we could see stale data in block A.  This function is a
2725 * helper to solve this problem.  It starts writeout of the truncated
2726 * part in case it is in the committing transaction.
2727 *
2728 * Filesystem code must call this function when inode is journaled in
2729 * Filesystem code must call this function when the inode is journaled
2730 * in ordered mode, before truncation happens and after the inode has been
2731 * avoids the race that someone writes new data and we start
2732 * committing the transaction after this function has been called but
2733 * before a transaction for truncate is started (and furthermore it
2734 * allows us to optimize the case where the addition to orphan list
2735 * happens in the same transaction as the write --- we don't have to
2736 * write any data in such a case).
2737 */
2738int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2739					struct jbd2_inode *jinode,
2740					loff_t new_size)
2741{
2742	transaction_t *inode_trans, *commit_trans;
2743	int ret = 0;
2744
2745	/* This is a quick check to avoid locking if not necessary */
2746	if (!jinode->i_transaction)
2747		goto out;
2748	/* Locks are here just to force reading of recent values; it is
2749	 * enough that the transaction was not committing before we started
2750	 * a transaction adding the inode to the orphan list */
2751	read_lock(&journal->j_state_lock);
2752	commit_trans = journal->j_committing_transaction;
2753	read_unlock(&journal->j_state_lock);
2754	spin_lock(&journal->j_list_lock);
2755	inode_trans = jinode->i_transaction;
2756	spin_unlock(&journal->j_list_lock);
2757	if (inode_trans == commit_trans) {
2758		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2759			new_size, LLONG_MAX);
2760		if (ret)
2761			jbd2_journal_abort(journal, ret);
2762	}
2763out:
2764	return ret;
2765}
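/*
 * Example (a minimal sketch of an assumed truncate path, called after
 * the inode has been put on the orphan list with its new size;
 * "attr->ia_size" is hypothetical):
 *
 *	err = jbd2_journal_begin_ordered_truncate(journal, jinode,
 *						  attr->ia_size);
 */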
2766