xref: /kernel/linux/linux-5.10/fs/jbd2/commit.c (revision 8c2ecf20)
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
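
/*
 * The counterpart of this handshake is in do_get_write_access(): a task
 * that wants to modify a buffer whose previous contents are still being
 * written to the log sleeps on the shadow bit, roughly (a sketch, not a
 * verbatim quote of transaction.c):
 *
 *	wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 *
 * The clear_bit_unlock() + smp_mb__after_atomic() + wake_up_bit() sequence
 * above pairs with that waiter: the barrier orders clearing the bit before
 * the wakeup's waitqueue check, so no sleeper can miss the wakeup.
 */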

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}
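
/*
 * Note that the checksum covers the whole commit block with the checksum
 * field itself zeroed first; recovery can then recompute it the same way
 * (zero h_chksum[0], checksum b_data over j_blocksize bytes) and compare.
 */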

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec64 now;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(REQ_OP_WRITE,
			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
	else
		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

	*cbh = bh;
	return ret;
}
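
/*
 * Note on the submission above: with JBD2_BARRIER set and no async_commit
 * feature, the commit block goes out with REQ_PREFLUSH | REQ_FUA, so the
 * device cache is flushed before the commit record is written (persisting
 * all previously submitted journal blocks) and the record itself is forced
 * to stable media. With async commit, a plain REQ_SYNC write is used and
 * correctness relies on the transaction checksum detecting a torn commit
 * during replay.
 */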

/*
 * This function, together with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't use
 * writepages() because with delayed allocation we may be doing block
 * allocation in writepages().
 */
int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};

	/*
	 * Submit the inode data buffers. We use writepage instead of
	 * writepages because writepages can do block allocation with
	 * delalloc; we need to write only already-allocated blocks here.
	 */
	return generic_writepages(mapping, &wbc);
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return jbd2_journal_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
	    !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
		return 0;
	return filemap_fdatawait_range_keep_errors(
		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
		jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);
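
/*
 * The _keep_errors variant of filemap_fdatawait_range() waits for writeback
 * in the tracked dirty range without clearing the mapping's error state, so
 * a later fsync()/fdatasync() can still see and report any I/O error that
 * occurred while the commit was waiting.
 */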

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
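
/*
 * The JI_COMMIT_RUNNING dance above lets us drop j_list_lock around the
 * possibly-blocking writeback call: anyone tearing down the jbd2_inode must
 * first wait for the flag to clear. A sketch of the waiting side (see
 * jbd2_journal_release_jbd_inode(), not quoted verbatim):
 *
 *	wait_on_bit(&jinode->i_flags, __JI_COMMIT_RUNNING,
 *		    TASK_UNINTERRUPTIBLE);
 *
 * which pairs with the smp_mb() + wake_up_bit() in the loop above.
 */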

int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

	return filemap_fdatawait_range_keep_errors(mapping,
						   jinode->i_dirty_start,
						   jinode->i_dirty_end);
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		cond_resched();
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
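
/*
 * The 64-bit block number is split across two on-disk fields: the low 32
 * bits go into t_blocknr and the high bits into t_blocknr_high. The shift
 * is written as (block >> 31) >> 1 rather than block >> 32 so that the
 * expression stays well-defined even if the block type is ever only 32 bits
 * wide (shifting by the full width of a type is undefined behaviour in C).
 */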

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
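
/*
 * The per-block tag checksum covers the commit sequence number followed by
 * the block contents, seeded with j_csum_seed. With the csum3 feature the
 * full 32-bit value is stored; with csum2 it is truncated to 16 bits to fit
 * the smaller tag format.
 */
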
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new
		 * fast commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit could proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

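	/*
	 * Drain any handles still running against this transaction:
	 * t_updates counts live handles. The prepare_to_wait()/recheck/
	 * schedule() pattern below avoids a lost wakeup, since
	 * jbd2_journal_stop() only wakes j_wait_updates after it has
	 * dropped its handle and decremented t_updates.
	 */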
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
	commit_transaction->t_state = T_SWITCH;

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
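
		/*
		 * Descriptor space accounting: every buffer consumes
		 * tag_bytes, the first tag is followed by the 16-byte
		 * journal UUID, and csum_size bytes stay reserved for the
		 * descriptor block tail. Hence the
		 * "space_left < tag_bytes + 16 + csum_size" cut-off below.
		 */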

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
		       "JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

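	/*
	 * Move the tail only if doing so frees a significant amount of log
	 * space. The log is circular: if the new tail block is numerically
	 * below the old one we have wrapped, so the freed count is corrected
	 * by the log size (j_last - j_first). E.g. with j_first = 1,
	 * j_last = 8193, old tail 8000 and new tail 100:
	 * freed = 100 - 8000 + 8192 = 292 blocks.
	 */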
	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < jbd2_journal_get_max_txn_bufs(journal))
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * If a buffer has been freed while still being journaled by a
		 * previous transaction, refile it to BJ_Forget of the running
		 * transaction. If the just committed transaction contains an
		 * "add to orphan" operation, we can completely invalidate the
		 * buffer now. We are rather thorough in that, since the
		 * buffer may still be accessible when blocksize < pagesize
		 * and it is attached to the last partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits. For the file mapping buffers (i.e.
			 * journalled data) we need to unmap buffer and clear
			 * more bits. We also need to be careful about the check
			 * because the data page mapping can get cleared under
			 * our hands. Note that if mapping == NULL, we don't
			 * need to make buffer unmapped because the page is
			 * already detached from the mapping and buffers cannot
			 * get reused.
			 */
			mapping = READ_ONCE(bh->b_page->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list.
	 * __journal_remove_checkpoint() cannot destroy the transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the old average higher than the new commit time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
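	/* I.e. an exponentially weighted moving average with alpha = 1/4:
	 * avg' = (commit_time + 3 * avg) / 4. */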
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}