1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Write ahead logging implementation copyright Chris Mason 2000
4 *
5 * The background commits make this code very interrelated, and
6 * overly complex.  I need to rethink things a bit.... The major players:
7 *
8 * journal_begin -- call with the number of blocks you expect to log.
9 *                  If the current transaction is too
10 *		    old, it will block until the current transaction is
11 *		    finished, and then start a new one.
12 *		    Usually, your transaction will get joined in with
13 *                  previous ones for speed.
14 *
15 * journal_join  -- same as journal_begin, but won't block on the current
16 *                  transaction regardless of age.  Don't ever call
17 *                  this.  Ever.  There are only two places it should be
18 *                  called from, and they are both inside this file.
19 *
20 * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
21 *                       that might make them get sent to disk
22 *                       and then marks them BH_JDirty.  Puts the buffer head
23 *                       into the current transaction hash.
24 *
25 * journal_end -- if the current transaction is batchable, it does nothing;
26 *                   otherwise, it could do an async/synchronous commit, or
27 *                   a full flush of all log and real blocks in the
28 *                   transaction.
29 *
30 * flush_old_commits -- if the current transaction is too old, it is ended and
31 *                      commit blocks are sent to disk.  Forces commit blocks
32 *                      to disk for all backgrounded commits that have been
33 *                      around too long.
34 *		     -- Note, if you call this as an immediate flush from
35 *		        within kupdate, it will ignore the immediate flag
36 */
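/*
 * A minimal usage sketch of the transaction API described above
 * (illustrative only: real callers must hold the reiserfs write lock
 * and check for errors; sb and bh stand in for a mounted superblock
 * and a buffer the caller has prepared):
 *
 *	struct reiserfs_transaction_handle th;
 *	int err;
 *
 *	err = journal_begin(&th, sb, 1);	// expect to log one block
 *	if (err)
 *		return err;
 *	reiserfs_prepare_for_journal(sb, bh, 1);
 *	journal_mark_dirty(&th, bh);		// add bh to the transaction
 *	err = journal_end(&th);			// batched or committed as needed
 */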
37
38#include <linux/time.h>
39#include <linux/semaphore.h>
40#include <linux/vmalloc.h>
41#include "reiserfs.h"
42#include <linux/kernel.h>
43#include <linux/errno.h>
44#include <linux/fcntl.h>
45#include <linux/stat.h>
46#include <linux/string.h>
47#include <linux/buffer_head.h>
48#include <linux/workqueue.h>
49#include <linux/writeback.h>
50#include <linux/blkdev.h>
51#include <linux/backing-dev.h>
52#include <linux/uaccess.h>
53#include <linux/slab.h>
54
55
56/* gets a struct reiserfs_journal_list * from a list head */
57#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
58                               j_list))
59
60/* must be correct to keep the desc and commit structs at 4k */
61#define JOURNAL_TRANS_HALF 1018
62#define BUFNR 64		/* read ahead */
63
64/* cnode stat bits.  Move these into reiserfs_fs.h */
65
66/* this block was freed, and can't be written.  */
67#define BLOCK_FREED 2
68/* this block was freed during this transaction, and can't be written */
69#define BLOCK_FREED_HOLDER 3
70
71/* used in flush_journal_list */
72#define BLOCK_NEEDS_FLUSH 4
73#define BLOCK_DIRTIED 5
74
75/* journal list state bits */
76#define LIST_TOUCHED 1
77#define LIST_DIRTY   2
78#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
79
80/* flags for do_journal_end */
81#define FLUSH_ALL   1		/* flush commit and real blocks */
82#define COMMIT_NOW  2		/* end and commit this transaction */
83#define WAIT        4		/* wait for the log blocks to hit the disk */
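/*
 * These flags combine.  do_journal_release() below ends its final
 * transaction with do_journal_end(&myth, FLUSH_ALL); a synchronous
 * end would instead pass COMMIT_NOW | WAIT to commit immediately and
 * wait for the log blocks to reach disk.
 */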
84
85static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
86static int flush_journal_list(struct super_block *s,
87			      struct reiserfs_journal_list *jl, int flushall);
88static int flush_commit_list(struct super_block *s,
89			     struct reiserfs_journal_list *jl, int flushall);
90static int can_dirty(struct reiserfs_journal_cnode *cn);
91static int journal_join(struct reiserfs_transaction_handle *th,
92			struct super_block *sb);
93static void release_journal_dev(struct super_block *super,
94			       struct reiserfs_journal *journal);
95static void dirty_one_transaction(struct super_block *s,
96				 struct reiserfs_journal_list *jl);
97static void flush_async_commits(struct work_struct *work);
98static void queue_log_writer(struct super_block *s);
99
100/* values for join in do_journal_begin_r */
101enum {
102	JBEGIN_REG = 0,		/* regular journal begin */
103	/* join the running transaction if at all possible */
104	JBEGIN_JOIN = 1,
105	/* called from cleanup code, ignores aborted flag */
106	JBEGIN_ABORT = 2,
107};
108
109static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
110			      struct super_block *sb,
111			      unsigned long nblocks, int join);
112
113static void init_journal_hash(struct super_block *sb)
114{
115	struct reiserfs_journal *journal = SB_JOURNAL(sb);
116	memset(journal->j_hash_table, 0,
117	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
118}
119
120/*
121 * clears BH_Dirty and the journal test bit (historically it also filed
122 * the buffer on a clean list).  Called because we can't allow
123 * refile_buffer to schedule after a block has been freed.  Look at
124 * remove_from_transaction and journal_mark_freed for more details.
125 */
126static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
127{
128	if (bh) {
129		clear_buffer_dirty(bh);
130		clear_buffer_journal_test(bh);
131	}
132	return 0;
133}
134
135static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
136							 *sb)
137{
138	struct reiserfs_bitmap_node *bn;
139	static int id;
140
141	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
142	if (!bn) {
143		return NULL;
144	}
145	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
146	if (!bn->data) {
147		kfree(bn);
148		return NULL;
149	}
150	bn->id = id++;
151	INIT_LIST_HEAD(&bn->list);
152	return bn;
153}
154
155static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
156{
157	struct reiserfs_journal *journal = SB_JOURNAL(sb);
158	struct reiserfs_bitmap_node *bn = NULL;
159	struct list_head *entry = journal->j_bitmap_nodes.next;
160
161	journal->j_used_bitmap_nodes++;
162repeat:
163
164	if (entry != &journal->j_bitmap_nodes) {
165		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
166		list_del(entry);
167		memset(bn->data, 0, sb->s_blocksize);
168		journal->j_free_bitmap_nodes--;
169		return bn;
170	}
171	bn = allocate_bitmap_node(sb);
172	if (!bn) {
173		yield();
174		goto repeat;
175	}
176	return bn;
177}
178static inline void free_bitmap_node(struct super_block *sb,
179				    struct reiserfs_bitmap_node *bn)
180{
181	struct reiserfs_journal *journal = SB_JOURNAL(sb);
182	journal->j_used_bitmap_nodes--;
183	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
184		kfree(bn->data);
185		kfree(bn);
186	} else {
187		list_add(&bn->list, &journal->j_bitmap_nodes);
188		journal->j_free_bitmap_nodes++;
189	}
190}
191
192static void allocate_bitmap_nodes(struct super_block *sb)
193{
194	int i;
195	struct reiserfs_journal *journal = SB_JOURNAL(sb);
196	struct reiserfs_bitmap_node *bn = NULL;
197	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
198		bn = allocate_bitmap_node(sb);
199		if (bn) {
200			list_add(&bn->list, &journal->j_bitmap_nodes);
201			journal->j_free_bitmap_nodes++;
202		} else {
203			/* this is ok, we'll try again when more are needed */
204			break;
205		}
206	}
207}
208
209static int set_bit_in_list_bitmap(struct super_block *sb,
210				  b_blocknr_t block,
211				  struct reiserfs_list_bitmap *jb)
212{
213	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
214	unsigned int bit_nr = block % (sb->s_blocksize << 3);
215
216	if (!jb->bitmaps[bmap_nr]) {
217		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
218	}
219	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
220	return 0;
221}
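/*
 * Worked example for the mapping above, assuming a 4096-byte block
 * size (4096 << 3 == 32768 bits per bitmap node): block 100000 lands
 * in bmap_nr 100000 / 32768 == 3, at bit_nr 100000 % 32768 == 1696 of
 * that node's data.
 */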
222
223static void cleanup_bitmap_list(struct super_block *sb,
224				struct reiserfs_list_bitmap *jb)
225{
226	int i;
227	if (jb->bitmaps == NULL)
228		return;
229
230	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
231		if (jb->bitmaps[i]) {
232			free_bitmap_node(sb, jb->bitmaps[i]);
233			jb->bitmaps[i] = NULL;
234		}
235	}
236}
237
238/*
239 * only call this on FS unmount.
240 */
241static int free_list_bitmaps(struct super_block *sb,
242			     struct reiserfs_list_bitmap *jb_array)
243{
244	int i;
245	struct reiserfs_list_bitmap *jb;
246	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
247		jb = jb_array + i;
248		jb->journal_list = NULL;
249		cleanup_bitmap_list(sb, jb);
250		vfree(jb->bitmaps);
251		jb->bitmaps = NULL;
252	}
253	return 0;
254}
255
256static int free_bitmap_nodes(struct super_block *sb)
257{
258	struct reiserfs_journal *journal = SB_JOURNAL(sb);
259	struct list_head *next = journal->j_bitmap_nodes.next;
260	struct reiserfs_bitmap_node *bn;
261
262	while (next != &journal->j_bitmap_nodes) {
263		bn = list_entry(next, struct reiserfs_bitmap_node, list);
264		list_del(next);
265		kfree(bn->data);
266		kfree(bn);
267		next = journal->j_bitmap_nodes.next;
268		journal->j_free_bitmap_nodes--;
269	}
270
271	return 0;
272}
273
274/*
275 * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
276 * jb_array is the array to be filled in.
277 */
278int reiserfs_allocate_list_bitmaps(struct super_block *sb,
279				   struct reiserfs_list_bitmap *jb_array,
280				   unsigned int bmap_nr)
281{
282	int i;
283	int failed = 0;
284	struct reiserfs_list_bitmap *jb;
285	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
286
287	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
288		jb = jb_array + i;
289		jb->journal_list = NULL;
290		jb->bitmaps = vzalloc(mem);
291		if (!jb->bitmaps) {
292			reiserfs_warning(sb, "clm-2000", "unable to "
293					 "allocate bitmaps for journal lists");
294			failed = 1;
295			break;
296		}
297	}
298	if (failed) {
299		free_list_bitmaps(sb, jb_array);
300		return -1;
301	}
302	return 0;
303}
304
305/*
306 * find an available list bitmap.  If you can't find one, flush a commit list
307 * and try again
308 */
309static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
310						    struct reiserfs_journal_list
311						    *jl)
312{
313	int i, j;
314	struct reiserfs_journal *journal = SB_JOURNAL(sb);
315	struct reiserfs_list_bitmap *jb = NULL;
316
317	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
318		i = journal->j_list_bitmap_index;
319		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
320		jb = journal->j_list_bitmap + i;
321		if (journal->j_list_bitmap[i].journal_list) {
322			flush_commit_list(sb,
323					  journal->j_list_bitmap[i].
324					  journal_list, 1);
325			if (!journal->j_list_bitmap[i].journal_list) {
326				break;
327			}
328		} else {
329			break;
330		}
331	}
332	/* double check to make sure it flushed correctly */
333	if (jb->journal_list)
334		return NULL;
335	jb->journal_list = jl;
336	return jb;
337}
338
339/*
340 * allocates a new chunk of X nodes, and links them all together as a list.
341 * Uses the cnode->next and cnode->prev pointers
342 * returns NULL on failure
343 */
344static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
345{
346	struct reiserfs_journal_cnode *head;
347	int i;
348	if (num_cnodes <= 0) {
349		return NULL;
350	}
351	head = vzalloc(array_size(num_cnodes,
352				  sizeof(struct reiserfs_journal_cnode)));
353	if (!head) {
354		return NULL;
355	}
356	head[0].prev = NULL;
357	head[0].next = head + 1;
358	for (i = 1; i < num_cnodes; i++) {
359		head[i].prev = head + (i - 1);
360		head[i].next = head + (i + 1);	/* the last one is fixed up after the loop */
361	}
362	head[num_cnodes - 1].next = NULL;
363	return head;
364}
365
366/* pulls a cnode off the free list, or returns NULL on failure */
367static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
368{
369	struct reiserfs_journal_cnode *cn;
370	struct reiserfs_journal *journal = SB_JOURNAL(sb);
371
372	reiserfs_check_lock_depth(sb, "get_cnode");
373
374	if (journal->j_cnode_free <= 0) {
375		return NULL;
376	}
377	journal->j_cnode_used++;
378	journal->j_cnode_free--;
379	cn = journal->j_cnode_free_list;
380	if (!cn) {
381		return cn;
382	}
383	if (cn->next) {
384		cn->next->prev = NULL;
385	}
386	journal->j_cnode_free_list = cn->next;
387	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
388	return cn;
389}
390
391/*
392 * returns a cnode to the free list
393 */
394static void free_cnode(struct super_block *sb,
395		       struct reiserfs_journal_cnode *cn)
396{
397	struct reiserfs_journal *journal = SB_JOURNAL(sb);
398
399	reiserfs_check_lock_depth(sb, "free_cnode");
400
401	journal->j_cnode_used--;
402	journal->j_cnode_free++;
403	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
404	cn->next = journal->j_cnode_free_list;
405	if (journal->j_cnode_free_list) {
406		journal->j_cnode_free_list->prev = cn;
407	}
408	cn->prev = NULL;	/* required since the memset above is commented out */
409	journal->j_cnode_free_list = cn;
410}
411
412static void clear_prepared_bits(struct buffer_head *bh)
413{
414	clear_buffer_journal_prepared(bh);
415	clear_buffer_journal_restore_dirty(bh);
416}
417
418/*
419 * return a cnode with same dev, block number and size in table,
420 * or null if not found
421 */
422static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
423								  super_block
424								  *sb,
425								  struct
426								  reiserfs_journal_cnode
427								  **table,
428								  long bl)
429{
430	struct reiserfs_journal_cnode *cn;
431	cn = journal_hash(table, sb, bl);
432	while (cn) {
433		if (cn->blocknr == bl && cn->sb == sb)
434			return cn;
435		cn = cn->hnext;
436	}
437	return NULL;
438}
439
440/*
441 * this actually means 'can this block be reallocated yet?'.  If you set
442 * search_all, a block can only be allocated if it is not in the current
443 * transaction, was not freed by the current transaction, and has no chance
444 * of ever being overwritten by a replay after crashing.
445 *
446 * If you don't set search_all, a block can only be allocated if it is not
447 * in the current transaction.  Since deleting a block removes it from the
448 * current transaction, this case should never happen.  If you don't set
449 * search_all, make sure you never write the block without logging it.
450 *
451 * next_zero_bit is a suggestion about the next block to try for find_forward.
452 * when bl is rejected because it is set in a journal list bitmap, we search
453 * for the next zero bit in the bitmap that rejected bl.  Then, we return
454 * that through next_zero_bit for find_forward to try.
455 *
456 * Just because we return something in next_zero_bit does not mean we won't
457 * reject it on the next call to reiserfs_in_journal
458 */
459int reiserfs_in_journal(struct super_block *sb,
460			unsigned int bmap_nr, int bit_nr, int search_all,
461			b_blocknr_t * next_zero_bit)
462{
463	struct reiserfs_journal *journal = SB_JOURNAL(sb);
464	struct reiserfs_list_bitmap *jb;
465	int i;
466	unsigned long bl;
467
468	*next_zero_bit = 0;	/* always start this at zero. */
469
470	PROC_INFO_INC(sb, journal.in_journal);
471	/*
472	 * If we aren't doing a search_all, this is a metablock, and it
473	 * will be logged before use.  If we crash before the transaction
474	 * that freed it commits, this transaction won't have committed
475	 * either, and the block will never be written.
476	 */
477	if (search_all) {
478		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
479			PROC_INFO_INC(sb, journal.in_journal_bitmap);
480			jb = journal->j_list_bitmap + i;
481			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
482			    test_bit(bit_nr,
483				     (unsigned long *)jb->bitmaps[bmap_nr]->
484				     data)) {
485				*next_zero_bit =
486				    find_next_zero_bit((unsigned long *)
487						       (jb->bitmaps[bmap_nr]->
488							data),
489						       sb->s_blocksize << 3,
490						       bit_nr + 1);
491				return 1;
492			}
493		}
494	}
495
496	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
497	/* is it in any old transactions? */
498	if (search_all
499	    && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
500		return 1;
501	}
502
503	/* is it in the current transaction?  This should never happen */
504	if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
505		BUG();
506		return 1;
507	}
508
509	PROC_INFO_INC(sb, journal.in_journal_reusable);
510	/* safe for reuse */
511	return 0;
512}
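/*
 * A hedged sketch of how an allocator might consume next_zero_bit
 * (the real caller is the bitmap scanning code in bitmap.c; the loop
 * below is illustrative, not a copy of it):
 *
 *	b_blocknr_t next;
 *	int bit = start;
 *
 *	while (reiserfs_in_journal(sb, bmap_nr, bit, 1, &next)) {
 *		if (next <= bit || next >= (sb->s_blocksize << 3))
 *			return 0;	// nothing reusable here, give up
 *		bit = next;		// retry at the suggested bit
 *	}
 *	// bit survived all journal checks and is safe to reallocate
 */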
513
514/* insert cn into table */
515static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
516				       struct reiserfs_journal_cnode *cn)
517{
518	struct reiserfs_journal_cnode *cn_orig;
519
520	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
521	cn->hnext = cn_orig;
522	cn->hprev = NULL;
523	if (cn_orig) {
524		cn_orig->hprev = cn;
525	}
526	journal_hash(table, cn->sb, cn->blocknr) = cn;
527}
528
529/* lock the current transaction */
530static inline void lock_journal(struct super_block *sb)
531{
532	PROC_INFO_INC(sb, journal.lock_journal);
533
534	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
535}
536
537/* unlock the current transaction */
538static inline void unlock_journal(struct super_block *sb)
539{
540	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
541}
542
543static inline void get_journal_list(struct reiserfs_journal_list *jl)
544{
545	jl->j_refcount++;
546}
547
548static inline void put_journal_list(struct super_block *s,
549				    struct reiserfs_journal_list *jl)
550{
551	if (jl->j_refcount < 1) {
552		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
553			       jl->j_trans_id, jl->j_refcount);
554	}
555	if (--jl->j_refcount == 0)
556		kfree(jl);
557}
558
559/*
560 * this used to be much more involved, and I'm keeping it just in case
561 * things get ugly again.  it gets called by flush_commit_list, and
562 * cleans up any data stored about blocks freed during a transaction.
563 */
564static void cleanup_freed_for_journal_list(struct super_block *sb,
565					   struct reiserfs_journal_list *jl)
566{
567
568	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
569	if (jb) {
570		cleanup_bitmap_list(sb, jb);
571	}
572	jl->j_list_bitmap->journal_list = NULL;
573	jl->j_list_bitmap = NULL;
574}
575
576static int journal_list_still_alive(struct super_block *s,
577				    unsigned int trans_id)
578{
579	struct reiserfs_journal *journal = SB_JOURNAL(s);
580	struct list_head *entry = &journal->j_journal_list;
581	struct reiserfs_journal_list *jl;
582
583	if (!list_empty(entry)) {
584		jl = JOURNAL_LIST_ENTRY(entry->next);
585		if (jl->j_trans_id <= trans_id) {
586			return 1;
587		}
588	}
589	return 0;
590}
591
592/*
593 * If folio->mapping was null, we failed to truncate this folio for
594 * some reason.  Most likely because it was truncated after being
595 * logged via data=journal.
596 *
597 * This does a check to see if the buffer belongs to one of these
598 * lost folios before doing the final put_bh.  If folio->mapping was
599 * null, it tries to free buffers on the folio, which should make the
600 * final folio_put drop the folio from the lru.
601 */
602static void release_buffer_page(struct buffer_head *bh)
603{
604	struct folio *folio = bh->b_folio;
605	if (!folio->mapping && folio_trylock(folio)) {
606		folio_get(folio);
607		put_bh(bh);
608		if (!folio->mapping)
609			try_to_free_buffers(folio);
610		folio_unlock(folio);
611		folio_put(folio);
612	} else {
613		put_bh(bh);
614	}
615}
616
617static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
618{
619	if (buffer_journaled(bh)) {
620		reiserfs_warning(NULL, "clm-2084",
621				 "pinned buffer %lu:%pg sent to disk",
622				 bh->b_blocknr, bh->b_bdev);
623	}
624	if (uptodate)
625		set_buffer_uptodate(bh);
626	else
627		clear_buffer_uptodate(bh);
628
629	unlock_buffer(bh);
630	release_buffer_page(bh);
631}
632
633static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
634{
635	if (uptodate)
636		set_buffer_uptodate(bh);
637	else
638		clear_buffer_uptodate(bh);
639	unlock_buffer(bh);
640	put_bh(bh);
641}
642
643static void submit_logged_buffer(struct buffer_head *bh)
644{
645	get_bh(bh);
646	bh->b_end_io = reiserfs_end_buffer_io_sync;
647	clear_buffer_journal_new(bh);
648	clear_buffer_dirty(bh);
649	if (!test_clear_buffer_journal_test(bh))
650		BUG();
651	if (!buffer_uptodate(bh))
652		BUG();
653	submit_bh(REQ_OP_WRITE, bh);
654}
655
656static void submit_ordered_buffer(struct buffer_head *bh)
657{
658	get_bh(bh);
659	bh->b_end_io = reiserfs_end_ordered_io;
660	clear_buffer_dirty(bh);
661	if (!buffer_uptodate(bh))
662		BUG();
663	submit_bh(REQ_OP_WRITE, bh);
664}
665
666#define CHUNK_SIZE 32
667struct buffer_chunk {
668	struct buffer_head *bh[CHUNK_SIZE];
669	int nr;
670};
671
672static void write_chunk(struct buffer_chunk *chunk)
673{
674	int i;
675	for (i = 0; i < chunk->nr; i++) {
676		submit_logged_buffer(chunk->bh[i]);
677	}
678	chunk->nr = 0;
679}
680
681static void write_ordered_chunk(struct buffer_chunk *chunk)
682{
683	int i;
684	for (i = 0; i < chunk->nr; i++) {
685		submit_ordered_buffer(chunk->bh[i]);
686	}
687	chunk->nr = 0;
688}
689
690static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
691			spinlock_t * lock, void (fn) (struct buffer_chunk *))
692{
693	int ret = 0;
694	BUG_ON(chunk->nr >= CHUNK_SIZE);
695	chunk->bh[chunk->nr++] = bh;
696	if (chunk->nr >= CHUNK_SIZE) {
697		ret = 1;
698		if (lock) {
699			spin_unlock(lock);
700			fn(chunk);
701			spin_lock(lock);
702		} else {
703			fn(chunk);
704		}
705	}
706	return ret;
707}
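/*
 * Illustrative use of the chunk helpers: callers accumulate dirty
 * buffers and add_to_chunk() submits a full chunk for them, as
 * write_one_transaction() below does.  Any leftovers are flushed by
 * hand:
 *
 *	struct buffer_chunk chunk;
 *
 *	chunk.nr = 0;
 *	add_to_chunk(&chunk, bh, NULL, write_chunk);
 *	...
 *	if (chunk.nr)		// submit whatever is left
 *		write_chunk(&chunk);
 */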
708
709static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
710static struct reiserfs_jh *alloc_jh(void)
711{
712	struct reiserfs_jh *jh;
713	while (1) {
714		jh = kmalloc(sizeof(*jh), GFP_NOFS);
715		if (jh) {
716			atomic_inc(&nr_reiserfs_jh);
717			return jh;
718		}
719		yield();
720	}
721}
722
723/*
724 * we want to free the jh when the buffer has been written
725 * and waited on
726 */
727void reiserfs_free_jh(struct buffer_head *bh)
728{
729	struct reiserfs_jh *jh;
730
731	jh = bh->b_private;
732	if (jh) {
733		bh->b_private = NULL;
734		jh->bh = NULL;
735		list_del_init(&jh->list);
736		kfree(jh);
737		if (atomic_read(&nr_reiserfs_jh) <= 0)
738			BUG();
739		atomic_dec(&nr_reiserfs_jh);
740		put_bh(bh);
741	}
742}
743
744static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
745			   int tail)
746{
747	struct reiserfs_jh *jh;
748
749	if (bh->b_private) {
750		spin_lock(&j->j_dirty_buffers_lock);
751		if (!bh->b_private) {
752			spin_unlock(&j->j_dirty_buffers_lock);
753			goto no_jh;
754		}
755		jh = bh->b_private;
756		list_del_init(&jh->list);
757	} else {
758no_jh:
759		get_bh(bh);
760		jh = alloc_jh();
761		spin_lock(&j->j_dirty_buffers_lock);
762		/*
763		 * buffer must be locked for __add_jh, so two adds
764		 * shouldn't be possible at the same time
765		 */
766		BUG_ON(bh->b_private);
767		jh->bh = bh;
768		bh->b_private = jh;
769	}
770	jh->jl = j->j_current_jl;
771	if (tail)
772		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
773	else {
774		list_add_tail(&jh->list, &jh->jl->j_bh_list);
775	}
776	spin_unlock(&j->j_dirty_buffers_lock);
777	return 0;
778}
779
780int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
781{
782	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
783}
784int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
785{
786	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
787}
788
789#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
790static int write_ordered_buffers(spinlock_t * lock,
791				 struct reiserfs_journal *j,
792				 struct reiserfs_journal_list *jl,
793				 struct list_head *list)
794{
795	struct buffer_head *bh;
796	struct reiserfs_jh *jh;
797	int ret = j->j_errno;
798	struct buffer_chunk chunk;
799	struct list_head tmp;
800	INIT_LIST_HEAD(&tmp);
801
802	chunk.nr = 0;
803	spin_lock(lock);
804	while (!list_empty(list)) {
805		jh = JH_ENTRY(list->next);
806		bh = jh->bh;
807		get_bh(bh);
808		if (!trylock_buffer(bh)) {
809			if (!buffer_dirty(bh)) {
810				list_move(&jh->list, &tmp);
811				goto loop_next;
812			}
813			spin_unlock(lock);
814			if (chunk.nr)
815				write_ordered_chunk(&chunk);
816			wait_on_buffer(bh);
817			cond_resched();
818			spin_lock(lock);
819			goto loop_next;
820		}
821		/*
822		 * in theory, dirty non-uptodate buffers should never get here,
823		 * but the upper layer io error paths still have a few quirks.
824		 * Handle them here as gracefully as we can
825		 */
826		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
827			clear_buffer_dirty(bh);
828			ret = -EIO;
829		}
830		if (buffer_dirty(bh)) {
831			list_move(&jh->list, &tmp);
832			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
833		} else {
834			reiserfs_free_jh(bh);
835			unlock_buffer(bh);
836		}
837loop_next:
838		put_bh(bh);
839		cond_resched_lock(lock);
840	}
841	if (chunk.nr) {
842		spin_unlock(lock);
843		write_ordered_chunk(&chunk);
844		spin_lock(lock);
845	}
846	while (!list_empty(&tmp)) {
847		jh = JH_ENTRY(tmp.prev);
848		bh = jh->bh;
849		get_bh(bh);
850		reiserfs_free_jh(bh);
851
852		if (buffer_locked(bh)) {
853			spin_unlock(lock);
854			wait_on_buffer(bh);
855			spin_lock(lock);
856		}
857		if (!buffer_uptodate(bh)) {
858			ret = -EIO;
859		}
860		/*
861		 * ugly interaction with invalidate_folio here.
862		 * reiserfs_invalidate_folio will pin any buffer that has a
863		 * valid journal head from an older transaction.  If someone
864		 * else sets our buffer dirty after we write it in the first
865		 * loop, and then someone truncates the page away, nobody
866		 * will ever write the buffer.  We're safe if we write the
867		 * buffer one last time after freeing the journal head.
868		 */
869		if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
870			spin_unlock(lock);
871			write_dirty_buffer(bh, 0);
872			spin_lock(lock);
873		}
874		put_bh(bh);
875		cond_resched_lock(lock);
876	}
877	spin_unlock(lock);
878	return ret;
879}
880
881static int flush_older_commits(struct super_block *s,
882			       struct reiserfs_journal_list *jl)
883{
884	struct reiserfs_journal *journal = SB_JOURNAL(s);
885	struct reiserfs_journal_list *other_jl;
886	struct reiserfs_journal_list *first_jl;
887	struct list_head *entry;
888	unsigned int trans_id = jl->j_trans_id;
889	unsigned int other_trans_id;
890
891find_first:
892	/*
893	 * first we walk backwards to find the oldest uncommitted transaction
894	 */
895	first_jl = jl;
896	entry = jl->j_list.prev;
897	while (1) {
898		other_jl = JOURNAL_LIST_ENTRY(entry);
899		if (entry == &journal->j_journal_list ||
900		    atomic_read(&other_jl->j_older_commits_done))
901			break;
902
903		first_jl = other_jl;
904		entry = other_jl->j_list.prev;
905	}
906
907	/* if we didn't find any older uncommitted transactions, return now */
908	if (first_jl == jl) {
909		return 0;
910	}
911
912	entry = &first_jl->j_list;
913	while (1) {
914		other_jl = JOURNAL_LIST_ENTRY(entry);
915		other_trans_id = other_jl->j_trans_id;
916
917		if (other_trans_id < trans_id) {
918			if (atomic_read(&other_jl->j_commit_left) != 0) {
919				flush_commit_list(s, other_jl, 0);
920
921				/* list we were called with is gone, return */
922				if (!journal_list_still_alive(s, trans_id))
923					return 1;
924
925				/*
926				 * the one we just flushed is gone, this means
927				 * all older lists are also gone, so first_jl
928				 * is no longer valid either.  Go back to the
929				 * beginning.
930				 */
931				if (!journal_list_still_alive
932				    (s, other_trans_id)) {
933					goto find_first;
934				}
935			}
936			entry = entry->next;
937			if (entry == &journal->j_journal_list)
938				return 0;
939		} else {
940			return 0;
941		}
942	}
943	return 0;
944}
945
946static int reiserfs_async_progress_wait(struct super_block *s)
947{
948	struct reiserfs_journal *j = SB_JOURNAL(s);
949
950	if (atomic_read(&j->j_async_throttle)) {
951		int depth;
952
953		depth = reiserfs_write_unlock_nested(s);
954		wait_var_event_timeout(&j->j_async_throttle,
955				       atomic_read(&j->j_async_throttle) == 0,
956				       HZ / 10);
957		reiserfs_write_lock_nested(s, depth);
958	}
959
960	return 0;
961}
962
963/*
964 * if this journal list still has commit blocks unflushed, send them to disk.
965 *
966 * log areas must be flushed in order (transaction 2 can't commit before
967 * transaction 1).  Before the commit block can be written, every other log
968 * block must be safely on disk.
969 */
970static int flush_commit_list(struct super_block *s,
971			     struct reiserfs_journal_list *jl, int flushall)
972{
973	int i;
974	b_blocknr_t bn;
975	struct buffer_head *tbh = NULL;
976	unsigned int trans_id = jl->j_trans_id;
977	struct reiserfs_journal *journal = SB_JOURNAL(s);
978	int retval = 0;
979	int write_len;
980	int depth;
981
982	reiserfs_check_lock_depth(s, "flush_commit_list");
983
984	if (atomic_read(&jl->j_older_commits_done)) {
985		return 0;
986	}
987
988	/*
989	 * before we can put our commit blocks on disk, we have to make
990	 * sure everyone older than us is on disk too
991	 */
992	BUG_ON(jl->j_len <= 0);
993	BUG_ON(trans_id == journal->j_trans_id);
994
995	get_journal_list(jl);
996	if (flushall) {
997		if (flush_older_commits(s, jl) == 1) {
998			/*
999			 * list disappeared during flush_older_commits.
1000			 * return
1001			 */
1002			goto put_jl;
1003		}
1004	}
1005
1006	/* make sure nobody is trying to flush this one at the same time */
1007	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1008
1009	if (!journal_list_still_alive(s, trans_id)) {
1010		mutex_unlock(&jl->j_commit_mutex);
1011		goto put_jl;
1012	}
1013	BUG_ON(jl->j_trans_id == 0);
1014
1015	/* this commit is done, exit */
1016	if (atomic_read(&jl->j_commit_left) <= 0) {
1017		if (flushall) {
1018			atomic_set(&jl->j_older_commits_done, 1);
1019		}
1020		mutex_unlock(&jl->j_commit_mutex);
1021		goto put_jl;
1022	}
1023
1024	if (!list_empty(&jl->j_bh_list)) {
1025		int ret;
1026
1027		/*
1028		 * We might sleep in numerous places inside
1029		 * write_ordered_buffers. Relax the write lock.
1030		 */
1031		depth = reiserfs_write_unlock_nested(s);
1032		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1033					    journal, jl, &jl->j_bh_list);
1034		if (ret < 0 && retval == 0)
1035			retval = ret;
1036		reiserfs_write_lock_nested(s, depth);
1037	}
1038	BUG_ON(!list_empty(&jl->j_bh_list));
1039	/*
1040	 * for the description block and all the log blocks, submit any buffers
1041	 * that haven't already reached the disk.  Try to write at least 256
1042	 * log blocks.  Later on, we will only wait on blocks that correspond
1043	 * to this transaction, but while we're unplugging we might as well
1044	 * get a chunk of data on there.
1045	 */
1046	atomic_inc(&journal->j_async_throttle);
1047	write_len = jl->j_len + 1;
1048	if (write_len < 256)
1049		write_len = 256;
1050	for (i = 0; i < write_len; i++) {
1051		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
1052		    SB_ONDISK_JOURNAL_SIZE(s);
1053		tbh = journal_find_get_block(s, bn);
1054		if (tbh) {
1055			if (buffer_dirty(tbh)) {
1056				depth = reiserfs_write_unlock_nested(s);
1057				write_dirty_buffer(tbh, 0);
1058				reiserfs_write_lock_nested(s, depth);
1059			}
1060			put_bh(tbh);
1061		}
1062	}
1063	if (atomic_dec_and_test(&journal->j_async_throttle))
1064		wake_up_var(&journal->j_async_throttle);
1065
1066	for (i = 0; i < (jl->j_len + 1); i++) {
1067		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1068		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1069		tbh = journal_find_get_block(s, bn);
1070
1071		depth = reiserfs_write_unlock_nested(s);
1072		__wait_on_buffer(tbh);
1073		reiserfs_write_lock_nested(s, depth);
1074		/*
1075		 * since we're using write_dirty_buffer() above, it might have
1076		 * skipped over a locked buffer.  Double check here
1077		 */
1078		/* redundant, sync_dirty_buffer() checks */
1079		if (buffer_dirty(tbh)) {
1080			depth = reiserfs_write_unlock_nested(s);
1081			sync_dirty_buffer(tbh);
1082			reiserfs_write_lock_nested(s, depth);
1083		}
1084		if (unlikely(!buffer_uptodate(tbh))) {
1085#ifdef CONFIG_REISERFS_CHECK
1086			reiserfs_warning(s, "journal-601",
1087					 "buffer write failed");
1088#endif
1089			retval = -EIO;
1090		}
1091		/* once for journal_find_get_block */
1092		put_bh(tbh);
1093		/* once due to original getblk in do_journal_end */
1094		put_bh(tbh);
1095		atomic_dec(&jl->j_commit_left);
1096	}
1097
1098	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
1099
1100	/*
1101	 * If there was a write error in the journal - we can't commit
1102	 * this transaction - it will be invalid and, if successful,
1103	 * will just end up propagating the write error out to
1104	 * the file system.
1105	 */
1106	if (likely(!retval && !reiserfs_is_journal_aborted(journal))) {
1107		if (buffer_dirty(jl->j_commit_bh))
1108			BUG();
1109		mark_buffer_dirty(jl->j_commit_bh);
1110		depth = reiserfs_write_unlock_nested(s);
1111		if (reiserfs_barrier_flush(s))
1112			__sync_dirty_buffer(jl->j_commit_bh,
1113					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
1114		else
1115			sync_dirty_buffer(jl->j_commit_bh);
1116		reiserfs_write_lock_nested(s, depth);
1117	}
1118
1119	/*
1120	 * If there was a write error in the journal - we can't commit this
1121	 * transaction - it will be invalid and, if successful, will just end
1122	 * up propagating the write error out to the filesystem.
1123	 */
1124	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
1125#ifdef CONFIG_REISERFS_CHECK
1126		reiserfs_warning(s, "journal-615", "buffer write failed");
1127#endif
1128		retval = -EIO;
1129	}
1130	bforget(jl->j_commit_bh);
1131	if (journal->j_last_commit_id != 0 &&
1132	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
1133		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
1134				 journal->j_last_commit_id, jl->j_trans_id);
1135	}
1136	journal->j_last_commit_id = jl->j_trans_id;
1137
1138	/*
1139	 * now, every commit block is on the disk.  It is safe to allow
1140	 * blocks freed during this transaction to be reallocated
1141	 */
1142	cleanup_freed_for_journal_list(s, jl);
1143
1144	retval = retval ? retval : journal->j_errno;
1145
1146	/* mark the metadata dirty */
1147	if (!retval)
1148		dirty_one_transaction(s, jl);
1149	atomic_dec(&jl->j_commit_left);
1150
1151	if (flushall) {
1152		atomic_set(&jl->j_older_commits_done, 1);
1153	}
1154	mutex_unlock(&jl->j_commit_mutex);
1155put_jl:
1156	put_journal_list(s, jl);
1157
1158	if (retval)
1159		reiserfs_abort(s, retval, "Journal write error in %s",
1160			       __func__);
1161	return retval;
1162}
1163
1164/*
1165 * flush_journal_list frequently needs to find a newer transaction for a
1166 * given block.  This does that, or returns NULL if it can't find anything
1167 */
1168static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
1169							  reiserfs_journal_cnode
1170							  *cn)
1171{
1172	struct super_block *sb = cn->sb;
1173	b_blocknr_t blocknr = cn->blocknr;
1174
1175	cn = cn->hprev;
1176	while (cn) {
1177		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
1178			return cn->jlist;
1179		}
1180		cn = cn->hprev;
1181	}
1182	return NULL;
1183}
1184
1185static void remove_journal_hash(struct super_block *,
1186				struct reiserfs_journal_cnode **,
1187				struct reiserfs_journal_list *, unsigned long,
1188				int);
1189
1190/*
1191 * once all the real blocks have been flushed, it is safe to remove them
1192 * from the journal list for this transaction.  Aside from freeing the
1193 * cnode, this also allows the block to be reallocated for data blocks
1194 * if it had been deleted.
1195 */
1196static void remove_all_from_journal_list(struct super_block *sb,
1197					 struct reiserfs_journal_list *jl,
1198					 int debug)
1199{
1200	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1201	struct reiserfs_journal_cnode *cn, *last;
1202	cn = jl->j_realblock;
1203
1204	/*
1205	 * which is better, to lock once around the whole loop, or
1206	 * to lock for each call to remove_journal_hash?
1207	 */
1208	while (cn) {
1209		if (cn->blocknr != 0) {
1210			if (debug) {
1211				reiserfs_warning(sb, "reiserfs-2201",
1212						 "block %u, bh is %d, state %ld",
1213						 cn->blocknr, cn->bh ? 1 : 0,
1214						 cn->state);
1215			}
1216			cn->state = 0;
1217			remove_journal_hash(sb, journal->j_list_hash_table,
1218					    jl, cn->blocknr, 1);
1219		}
1220		last = cn;
1221		cn = cn->next;
1222		free_cnode(sb, last);
1223	}
1224	jl->j_realblock = NULL;
1225}
1226
1227/*
1228 * if this timestamp is greater than the timestamp we wrote last to the
1229 * header block, write it to the header block.  once this is done, I can
1230 * safely say the log area for this transaction won't ever be replayed,
1231 * and I can start releasing blocks in this transaction for reuse as data
1232 * blocks.  called by flush_journal_list, before it calls
1233 * remove_all_from_journal_list
1234 */
1235static int _update_journal_header_block(struct super_block *sb,
1236					unsigned long offset,
1237					unsigned int trans_id)
1238{
1239	struct reiserfs_journal_header *jh;
1240	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1241	int depth;
1242
1243	if (reiserfs_is_journal_aborted(journal))
1244		return -EIO;
1245
1246	if (trans_id >= journal->j_last_flush_trans_id) {
1247		if (buffer_locked((journal->j_header_bh))) {
1248			depth = reiserfs_write_unlock_nested(sb);
1249			__wait_on_buffer(journal->j_header_bh);
1250			reiserfs_write_lock_nested(sb, depth);
1251			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1252#ifdef CONFIG_REISERFS_CHECK
1253				reiserfs_warning(sb, "journal-699",
1254						 "buffer write failed");
1255#endif
1256				return -EIO;
1257			}
1258		}
1259		journal->j_last_flush_trans_id = trans_id;
1260		journal->j_first_unflushed_offset = offset;
1261		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
1262							b_data);
1263		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
1264		jh->j_first_unflushed_offset = cpu_to_le32(offset);
1265		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1266
1267		set_buffer_dirty(journal->j_header_bh);
1268		depth = reiserfs_write_unlock_nested(sb);
1269
1270		if (reiserfs_barrier_flush(sb))
1271			__sync_dirty_buffer(journal->j_header_bh,
1272					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
1273		else
1274			sync_dirty_buffer(journal->j_header_bh);
1275
1276		reiserfs_write_lock_nested(sb, depth);
1277		if (!buffer_uptodate(journal->j_header_bh)) {
1278			reiserfs_warning(sb, "journal-837",
1279					 "IO error during journal replay");
1280			return -EIO;
1281		}
1282	}
1283	return 0;
1284}
1285
1286static int update_journal_header_block(struct super_block *sb,
1287				       unsigned long offset,
1288				       unsigned int trans_id)
1289{
1290	return _update_journal_header_block(sb, offset, trans_id);
1291}
1292
1293/*
1294 * flush any and all journal lists older than you are.
1295 * Can only be called from flush_journal_list.
1296 */
1297static int flush_older_journal_lists(struct super_block *sb,
1298				     struct reiserfs_journal_list *jl)
1299{
1300	struct list_head *entry;
1301	struct reiserfs_journal_list *other_jl;
1302	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1303	unsigned int trans_id = jl->j_trans_id;
1304
1305	/*
1306	 * we know we are the only ones flushing things, no extra race
1307	 * protection is required.
1308	 */
1309restart:
1310	entry = journal->j_journal_list.next;
1311	/* Did we wrap? */
1312	if (entry == &journal->j_journal_list)
1313		return 0;
1314	other_jl = JOURNAL_LIST_ENTRY(entry);
1315	if (other_jl->j_trans_id < trans_id) {
1316		BUG_ON(other_jl->j_refcount <= 0);
1317		/* do not flush all */
1318		flush_journal_list(sb, other_jl, 0);
1319
1320		/* other_jl is now deleted from the list */
1321		goto restart;
1322	}
1323	return 0;
1324}
1325
1326static void del_from_work_list(struct super_block *s,
1327			       struct reiserfs_journal_list *jl)
1328{
1329	struct reiserfs_journal *journal = SB_JOURNAL(s);
1330	if (!list_empty(&jl->j_working_list)) {
1331		list_del_init(&jl->j_working_list);
1332		journal->j_num_work_lists--;
1333	}
1334}
1335
1336/*
1337 * flush a journal list, both commit and real blocks
1338 *
1339 * always set flushall to 1, unless you are calling from inside
1340 * flush_journal_list
1341 *
1342 * IMPORTANT.  This can only be called while there are no journal writers,
1343 * and the journal is locked.  That means it can only be called from
1344 * do_journal_end, or by journal_release
1345 */
1346static int flush_journal_list(struct super_block *s,
1347			      struct reiserfs_journal_list *jl, int flushall)
1348{
1349	struct reiserfs_journal_list *pjl;
1350	struct reiserfs_journal_cnode *cn;
1351	int count;
1352	int was_jwait = 0;
1353	int was_dirty = 0;
1354	struct buffer_head *saved_bh;
1355	unsigned long j_len_saved = jl->j_len;
1356	struct reiserfs_journal *journal = SB_JOURNAL(s);
1357	int err = 0;
1358	int depth;
1359
1360	BUG_ON(j_len_saved <= 0);
1361
1362	if (atomic_read(&journal->j_wcount) != 0) {
1363		reiserfs_warning(s, "clm-2048", "called with wcount %d",
1364				 atomic_read(&journal->j_wcount));
1365	}
1366
1367	/* if flushall == 0, the lock is already held */
1368	if (flushall) {
1369		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1370	} else if (mutex_trylock(&journal->j_flush_mutex)) {
1371		BUG();
1372	}
1373
1374	count = 0;
1375	if (j_len_saved > journal->j_trans_max) {
1376		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
1377			       j_len_saved, jl->j_trans_id);
1378		return 0;
1379	}
1380
1381	/* if all the work is already done, get out of here */
1382	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
1383	    atomic_read(&jl->j_commit_left) <= 0) {
1384		goto flush_older_and_return;
1385	}
1386
1387	/*
1388	 * start by putting the commit list on disk.  This will also flush
1389	 * the commit lists of any older transactions
1390	 */
1391	flush_commit_list(s, jl, 1);
1392
1393	if (!(jl->j_state & LIST_DIRTY)
1394	    && !reiserfs_is_journal_aborted(journal))
1395		BUG();
1396
1397	/* are we done now? */
1398	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
1399	    atomic_read(&jl->j_commit_left) <= 0) {
1400		goto flush_older_and_return;
1401	}
1402
1403	/*
1404	 * loop through each cnode, see if we need to write it,
1405	 * or wait on a more recent transaction, or just ignore it
1406	 */
1407	if (atomic_read(&journal->j_wcount) != 0) {
1408		reiserfs_panic(s, "journal-844", "journal list is flushing, "
1409			       "wcount is not 0");
1410	}
1411	cn = jl->j_realblock;
1412	while (cn) {
1413		was_jwait = 0;
1414		was_dirty = 0;
1415		saved_bh = NULL;
1416		/* blocknr of 0 is no longer in the hash, ignore it */
1417		if (cn->blocknr == 0) {
1418			goto free_cnode;
1419		}
1420
1421		/*
1422		 * This transaction failed commit.
1423		 * Don't write out to the disk
1424		 */
1425		if (!(jl->j_state & LIST_DIRTY))
1426			goto free_cnode;
1427
1428		pjl = find_newer_jl_for_cn(cn);
1429		/*
1430		 * the order is important here.  We check pjl to make sure we
1431		 * don't clear BH_JDirty_wait if we aren't the one writing this
1432		 * block to disk
1433		 */
1434		if (!pjl && cn->bh) {
1435			saved_bh = cn->bh;
1436
1437			/*
1438			 * we do this to make sure nobody releases the
1439			 * buffer while we are working with it
1440			 */
1441			get_bh(saved_bh);
1442
1443			if (buffer_journal_dirty(saved_bh)) {
1444				BUG_ON(!can_dirty(cn));
1445				was_jwait = 1;
1446				was_dirty = 1;
1447			} else if (can_dirty(cn)) {
1448				/*
1449				 * everything with !pjl && jwait
1450				 * should be writable
1451				 */
1452				BUG();
1453			}
1454		}
1455
1456		/*
1457		 * if someone has this block in a newer transaction, just make
1458		 * sure they are committed, and don't try writing it to disk
1459		 */
1460		if (pjl) {
1461			if (atomic_read(&pjl->j_commit_left))
1462				flush_commit_list(s, pjl, 1);
1463			goto free_cnode;
1464		}
1465
1466		/*
1467		 * bh == NULL when the block got to disk on its own, OR,
1468		 * the block got freed in a future transaction
1469		 */
1470		if (saved_bh == NULL) {
1471			goto free_cnode;
1472		}
1473
1474		/*
1475		 * this should never happen.  kupdate_one_transaction has
1476		 * this list locked while it works, so we should never see a
1477		 * buffer here that is not marked JDirty_wait
1478		 */
1479		if ((!was_jwait) && !buffer_locked(saved_bh)) {
1480			reiserfs_warning(s, "journal-813",
1481					 "BAD! buffer %llu %cdirty %cjwait, "
1482					 "not in a newer transaction",
1483					 (unsigned long long)saved_bh->
1484					 b_blocknr, was_dirty ? ' ' : '!',
1485					 was_jwait ? ' ' : '!');
1486		}
1487		if (was_dirty) {
1488			/*
1489			 * we inc again because saved_bh gets decremented
1490			 * at free_cnode
1491			 */
1492			get_bh(saved_bh);
1493			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
1494			lock_buffer(saved_bh);
1495			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
1496			if (buffer_dirty(saved_bh))
1497				submit_logged_buffer(saved_bh);
1498			else
1499				unlock_buffer(saved_bh);
1500			count++;
1501		} else {
1502			reiserfs_warning(s, "clm-2082",
1503					 "Unable to flush buffer %llu in %s",
1504					 (unsigned long long)saved_bh->
1505					 b_blocknr, __func__);
1506		}
1507free_cnode:
1508		cn = cn->next;
1509		if (saved_bh) {
1510			/*
1511			 * we incremented this to keep others from
1512			 * taking the buffer head away
1513			 */
1514			put_bh(saved_bh);
1515			if (atomic_read(&saved_bh->b_count) < 0) {
1516				reiserfs_warning(s, "journal-945",
1517						 "saved_bh->b_count < 0");
1518			}
1519		}
1520	}
1521	if (count > 0) {
1522		cn = jl->j_realblock;
1523		while (cn) {
1524			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
1525				if (!cn->bh) {
1526					reiserfs_panic(s, "journal-1011",
1527						       "cn->bh is NULL");
1528				}
1529
1530				depth = reiserfs_write_unlock_nested(s);
1531				__wait_on_buffer(cn->bh);
1532				reiserfs_write_lock_nested(s, depth);
1533
1534				if (!cn->bh) {
1535					reiserfs_panic(s, "journal-1012",
1536						       "cn->bh is NULL");
1537				}
1538				if (unlikely(!buffer_uptodate(cn->bh))) {
1539#ifdef CONFIG_REISERFS_CHECK
1540					reiserfs_warning(s, "journal-949",
1541							 "buffer write failed");
1542#endif
1543					err = -EIO;
1544				}
1545				/*
1546				 * note, we must clear the JDirty_wait bit
1547				 * after the up to date check, otherwise we
1548				 * race against our flushpage routine
1549				 */
1550				BUG_ON(!test_clear_buffer_journal_dirty
1551				       (cn->bh));
1552
1553				/* drop one ref for us */
1554				put_bh(cn->bh);
1555				/* drop one ref for journal_mark_dirty */
1556				release_buffer_page(cn->bh);
1557			}
1558			cn = cn->next;
1559		}
1560	}
1561
1562	if (err)
1563		reiserfs_abort(s, -EIO,
1564			       "Write error while pushing transaction to disk in %s",
1565			       __func__);
1566flush_older_and_return:
1567
1568	/*
1569	 * before we can update the journal header block, we _must_ flush all
1570	 * real blocks from all older transactions to disk.  This is because
1571	 * once the header block is updated, this transaction will not be
1572	 * replayed after a crash
1573	 */
1574	if (flushall) {
1575		flush_older_journal_lists(s, jl);
1576	}
1577
1578	err = journal->j_errno;
1579	/*
1580	 * before we can remove everything from the hash tables for this
1581	 * transaction, we must make sure it can never be replayed
1582	 *
1583	 * since we are only called from do_journal_end, we know for sure there
1584	 * are no allocations going on while we are flushing journal lists.  So,
1585	 * we only need to update the journal header block for the last list
1586	 * being flushed
1587	 */
1588	if (!err && flushall) {
1589		err =
1590		    update_journal_header_block(s,
1591						(jl->j_start + jl->j_len +
1592						 2) % SB_ONDISK_JOURNAL_SIZE(s),
1593						jl->j_trans_id);
1594		if (err)
1595			reiserfs_abort(s, -EIO,
1596				       "Write error while updating journal header in %s",
1597				       __func__);
1598	}
1599	remove_all_from_journal_list(s, jl, 0);
1600	list_del_init(&jl->j_list);
1601	journal->j_num_lists--;
1602	del_from_work_list(s, jl);
1603
1604	if (journal->j_last_flush_id != 0 &&
1605	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
1606		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
1607				 journal->j_last_flush_id, jl->j_trans_id);
1608	}
1609	journal->j_last_flush_id = jl->j_trans_id;
1610
1611	/*
1612	 * not strictly required since we are freeing the list, but it should
1613	 * help find code using dead lists later on
1614	 */
1615	jl->j_len = 0;
1616	atomic_set(&jl->j_nonzerolen, 0);
1617	jl->j_start = 0;
1618	jl->j_realblock = NULL;
1619	jl->j_commit_bh = NULL;
1620	jl->j_trans_id = 0;
1621	jl->j_state = 0;
1622	put_journal_list(s, jl);
1623	if (flushall)
1624		mutex_unlock(&journal->j_flush_mutex);
1625	return err;
1626}
1627
1628static int write_one_transaction(struct super_block *s,
1629				 struct reiserfs_journal_list *jl,
1630				 struct buffer_chunk *chunk)
1631{
1632	struct reiserfs_journal_cnode *cn;
1633	int ret = 0;
1634
1635	jl->j_state |= LIST_TOUCHED;
1636	del_from_work_list(s, jl);
1637	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
1638		return 0;
1639	}
1640
1641	cn = jl->j_realblock;
1642	while (cn) {
1643		/*
1644		 * if the blocknr == 0, this has been cleared from the hash,
1645		 * skip it
1646		 */
1647		if (cn->blocknr == 0) {
1648			goto next;
1649		}
1650		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
1651			struct buffer_head *tmp_bh;
1652			/*
1653			 * we can race against journal_mark_freed when we try
1654			 * to lock_buffer(cn->bh), so we have to inc the buffer
1655			 * count, and recheck things after locking
1656			 */
1657			tmp_bh = cn->bh;
1658			get_bh(tmp_bh);
1659			lock_buffer(tmp_bh);
1660			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
1661				if (!buffer_journal_dirty(tmp_bh) ||
1662				    buffer_journal_prepared(tmp_bh))
1663					BUG();
1664				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
1665				ret++;
1666			} else {
1667				/* note, cn->bh might be null now */
1668				unlock_buffer(tmp_bh);
1669			}
1670			put_bh(tmp_bh);
1671		}
1672next:
1673		cn = cn->next;
1674		cond_resched();
1675	}
1676	return ret;
1677}
1678
1679/* used by flush_commit_list */
1680static void dirty_one_transaction(struct super_block *s,
1681				 struct reiserfs_journal_list *jl)
1682{
1683	struct reiserfs_journal_cnode *cn;
1684	struct reiserfs_journal_list *pjl;
1685
1686	jl->j_state |= LIST_DIRTY;
1687	cn = jl->j_realblock;
1688	while (cn) {
1689		/*
1690		 * look for a more recent transaction that logged this
1691		 * buffer.  Only the most recent transaction with a buffer in
1692		 * it is allowed to send that buffer to disk
1693		 */
1694		pjl = find_newer_jl_for_cn(cn);
1695		if (!pjl && cn->blocknr && cn->bh
1696		    && buffer_journal_dirty(cn->bh)) {
1697			BUG_ON(!can_dirty(cn));
1698			/*
1699			 * if the buffer is prepared, it will either be logged
1700			 * or restored.  If restored, we need to make sure
1701			 * it actually gets marked dirty
1702			 */
1703			clear_buffer_journal_new(cn->bh);
1704			if (buffer_journal_prepared(cn->bh)) {
1705				set_buffer_journal_restore_dirty(cn->bh);
1706			} else {
1707				set_buffer_journal_test(cn->bh);
1708				mark_buffer_dirty(cn->bh);
1709			}
1710		}
1711		cn = cn->next;
1712	}
1713}
1714
1715static int kupdate_transactions(struct super_block *s,
1716				struct reiserfs_journal_list *jl,
1717				struct reiserfs_journal_list **next_jl,
1718				unsigned int *next_trans_id,
1719				int num_blocks, int num_trans)
1720{
1721	int ret = 0;
1722	int written = 0;
1723	int transactions_flushed = 0;
1724	unsigned int orig_trans_id = jl->j_trans_id;
1725	struct buffer_chunk chunk;
1726	struct list_head *entry;
1727	struct reiserfs_journal *journal = SB_JOURNAL(s);
1728	chunk.nr = 0;
1729
1730	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1731	if (!journal_list_still_alive(s, orig_trans_id)) {
1732		goto done;
1733	}
1734
1735	/*
1736	 * we've got j_flush_mutex held, nobody is going to delete any
1737	 * of these lists out from underneath us
1738	 */
1739	while ((num_trans && transactions_flushed < num_trans) ||
1740	       (!num_trans && written < num_blocks)) {
1741
1742		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
1743		    atomic_read(&jl->j_commit_left)
1744		    || !(jl->j_state & LIST_DIRTY)) {
1745			del_from_work_list(s, jl);
1746			break;
1747		}
1748		ret = write_one_transaction(s, jl, &chunk);
1749
1750		if (ret < 0)
1751			goto done;
1752		transactions_flushed++;
1753		written += ret;
1754		entry = jl->j_list.next;
1755
1756		/* did we wrap? */
1757		if (entry == &journal->j_journal_list) {
1758			break;
1759		}
1760		jl = JOURNAL_LIST_ENTRY(entry);
1761
1762		/* don't bother with older transactions */
1763		if (jl->j_trans_id <= orig_trans_id)
1764			break;
1765	}
1766	if (chunk.nr) {
1767		write_chunk(&chunk);
1768	}
1769
1770done:
1771	mutex_unlock(&journal->j_flush_mutex);
1772	return ret;
1773}
1774
1775/*
1776 * O_SYNC- and fsync-heavy applications tend to use up
1777 * all the journal list slots with tiny transactions.  These
1778 * trigger lots and lots of calls to update the header block, which
1779 * adds seeks and slows things down.
1780 *
1781 * This function tries to clear out a large chunk of the journal lists
1782 * at once, which makes everything faster since only the newest journal
1783 * list updates the header block
1784 */
1785static int flush_used_journal_lists(struct super_block *s,
1786				    struct reiserfs_journal_list *jl)
1787{
1788	unsigned long len = 0;
1789	unsigned long cur_len;
1790	int i;
1791	int limit = 256;
1792	struct reiserfs_journal_list *tjl;
1793	struct reiserfs_journal_list *flush_jl;
1794	unsigned int trans_id;
1795	struct reiserfs_journal *journal = SB_JOURNAL(s);
1796
1797	flush_jl = tjl = jl;
1798
1799	/* in data logging mode, try harder to flush a lot of blocks */
1800	if (reiserfs_data_log(s))
1801		limit = 1024;
1802	/* flush for 256 transactions or limit blocks, whichever comes first */
1803	for (i = 0; i < 256 && len < limit; i++) {
1804		if (atomic_read(&tjl->j_commit_left) ||
1805		    tjl->j_trans_id < jl->j_trans_id) {
1806			break;
1807		}
1808		cur_len = atomic_read(&tjl->j_nonzerolen);
1809		if (cur_len > 0) {
1810			tjl->j_state &= ~LIST_TOUCHED;
1811		}
1812		len += cur_len;
1813		flush_jl = tjl;
1814		if (tjl->j_list.next == &journal->j_journal_list)
1815			break;
1816		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
1817	}
1818	get_journal_list(jl);
1819	get_journal_list(flush_jl);
1820
1821	/*
1822	 * try to find a group of blocks we can flush across all the
1823	 * transactions, but only bother if we've actually spanned
1824	 * across multiple lists
1825	 */
1826	if (flush_jl != jl)
1827		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
1828
1829	flush_journal_list(s, flush_jl, 1);
1830	put_journal_list(s, flush_jl);
1831	put_journal_list(s, jl);
1832	return 0;
1833}
1834
1835/*
1836 * removes any nodes in table with the same block and dev as bh.
1837 * Only touches the hnext and hprev pointers.
1838 */
1839static void remove_journal_hash(struct super_block *sb,
1840			 struct reiserfs_journal_cnode **table,
1841			 struct reiserfs_journal_list *jl,
1842			 unsigned long block, int remove_freed)
1843{
1844	struct reiserfs_journal_cnode *cur;
1845	struct reiserfs_journal_cnode **head;
1846
1847	head = &(journal_hash(table, sb, block));
1848	if (!head) {
1849		return;
1850	}
1851	cur = *head;
1852	while (cur) {
1853		if (cur->blocknr == block && cur->sb == sb
1854		    && (jl == NULL || jl == cur->jlist)
1855		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
1856			if (cur->hnext) {
1857				cur->hnext->hprev = cur->hprev;
1858			}
1859			if (cur->hprev) {
1860				cur->hprev->hnext = cur->hnext;
1861			} else {
1862				*head = cur->hnext;
1863			}
1864			cur->blocknr = 0;
1865			cur->sb = NULL;
1866			cur->state = 0;
1867			/*
1868			 * anybody who clears the cur->bh will also
1869			 * dec the nonzerolen
1870			 */
1871			if (cur->bh && cur->jlist)
1872				atomic_dec(&cur->jlist->j_nonzerolen);
1873			cur->bh = NULL;
1874			cur->jlist = NULL;
1875		}
1876		cur = cur->hnext;
1877	}
1878}
1879
1880static void free_journal_ram(struct super_block *sb)
1881{
1882	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1883	kfree(journal->j_current_jl);
1884	journal->j_num_lists--;
1885
1886	vfree(journal->j_cnode_free_orig);
1887	free_list_bitmaps(sb, journal->j_list_bitmap);
1888	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
1889	if (journal->j_header_bh) {
1890		brelse(journal->j_header_bh);
1891	}
1892	/*
1893	 * j_header_bh is on the journal dev, make sure
1894	 * not to release the journal dev until we brelse j_header_bh
1895	 */
1896	release_journal_dev(sb, journal);
1897	vfree(journal);
1898}
1899
1900/*
1901 * call on unmount.  Only set error to 1 if you haven't made your way out
1902 * of read_super() yet.  Any other caller must keep error at 0.
1903 */
1904static int do_journal_release(struct reiserfs_transaction_handle *th,
1905			      struct super_block *sb, int error)
1906{
1907	struct reiserfs_transaction_handle myth;
1908	struct reiserfs_journal *journal = SB_JOURNAL(sb);
1909
1910	/*
1911	 * we only want to flush out transactions if we were
1912	 * called with error == 0
1913	 */
1914	if (!error && !sb_rdonly(sb)) {
1915		/* end the current trans */
1916		BUG_ON(!th->t_trans_id);
1917		do_journal_end(th, FLUSH_ALL);
1918
1919		/*
1920		 * make sure something gets logged to force
1921		 * our way into the flush code
1922		 */
1923		if (!journal_join(&myth, sb)) {
1924			reiserfs_prepare_for_journal(sb,
1925						     SB_BUFFER_WITH_SB(sb),
1926						     1);
1927			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
1928			do_journal_end(&myth, FLUSH_ALL);
1929		}
1930	}
1931
1932	/* this also catches errors during the do_journal_end above */
1933	if (!error && reiserfs_is_journal_aborted(journal)) {
1934		memset(&myth, 0, sizeof(myth));
1935		if (!journal_join_abort(&myth, sb)) {
1936			reiserfs_prepare_for_journal(sb,
1937						     SB_BUFFER_WITH_SB(sb),
1938						     1);
1939			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
1940			do_journal_end(&myth, FLUSH_ALL);
1941		}
1942	}
1943
1945	/*
1946	 * We must release the write lock here because
1947	 * the workqueue job (flush_async_commit) needs this lock
1948	 */
1949	reiserfs_write_unlock(sb);
1950
1951	/*
1952	 * Cancel flushing of old commits. Note that neither of these work
1953	 * items will be requeued because the superblock is being shut down
1954	 * and doesn't have SB_ACTIVE set.
1955	 */
1956	reiserfs_cancel_old_flush(sb);
1957	/* wait for all commits to finish */
1958	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
1959
1960	free_journal_ram(sb);
1961
1962	reiserfs_write_lock(sb);
1963
1964	return 0;
1965}
1966
1967/* call on unmount: flush all journal transactions, release all allocated RAM */
1968int journal_release(struct reiserfs_transaction_handle *th,
1969		    struct super_block *sb)
1970{
1971	return do_journal_release(th, sb, 0);
1972}
1973
1974/* only call from an error condition inside reiserfs_read_super!  */
1975int journal_release_error(struct reiserfs_transaction_handle *th,
1976			  struct super_block *sb)
1977{
1978	return do_journal_release(th, sb, 1);
1979}
1980
1981/*
1982 * compares description block with commit block.
1983 * returns 1 if they differ, 0 if they are the same
1984 */
1985static int journal_compare_desc_commit(struct super_block *sb,
1986				       struct reiserfs_journal_desc *desc,
1987				       struct reiserfs_journal_commit *commit)
1988{
1989	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
1990	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
1991	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
1992	    get_commit_trans_len(commit) <= 0) {
1993		return 1;
1994	}
1995	return 0;
1996}
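
/*
 * Illustration (derived from the checks above and the reads done by the
 * callers below): for a transaction whose desc block sits at 'offset'
 * inside the circular log, the matching commit block is expected at
 *
 *	SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 *	    ((offset + trans_len + 1) % SB_ONDISK_JOURNAL_SIZE(sb))
 *
 * i.e. the desc block, then trans_len logged blocks, then the commit
 * block, wrapping around the end of the on-disk journal.
 */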
1997
1998/*
1999 * returns 0 if it did not find a description block
2000 * returns -1 if it found a corrupt commit block or a stale mount id
2001 * returns 1 if both desc and commit were valid
2002 * NOTE: only called during fs mount
2003 */
2004static int journal_transaction_is_valid(struct super_block *sb,
2005					struct buffer_head *d_bh,
2006					unsigned int *oldest_invalid_trans_id,
2007					unsigned long *newest_mount_id)
2008{
2009	struct reiserfs_journal_desc *desc;
2010	struct reiserfs_journal_commit *commit;
2011	struct buffer_head *c_bh;
2012	unsigned long offset;
2013
2014	if (!d_bh)
2015		return 0;
2016
2017	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2018	if (get_desc_trans_len(desc) > 0
2019	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
2020		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
2021		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
2022			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2023				       "journal-986: transaction "
2024				       "is valid returning because trans_id %d is greater than "
2025				       "oldest_invalid %lu",
2026				       get_desc_trans_id(desc),
2027				       *oldest_invalid_trans_id);
2028			return 0;
2029		}
2030		if (newest_mount_id
2031		    && *newest_mount_id > get_desc_mount_id(desc)) {
2032			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2033				       "journal-1087: transaction "
2034				       "is valid returning because mount_id %d is less than "
2035				       "newest_mount_id %lu",
2036				       get_desc_mount_id(desc),
2037				       *newest_mount_id);
2038			return -1;
2039		}
2040		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
2041			reiserfs_warning(sb, "journal-2018",
2042					 "Bad transaction length %d "
2043					 "encountered, ignoring transaction",
2044					 get_desc_trans_len(desc));
2045			return -1;
2046		}
2047		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2048
2049		/*
2050		 * ok, we have a journal description block,
2051		 * let's see if the transaction was valid
2052		 */
2053		c_bh =
2054		    journal_bread(sb,
2055				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2056				  ((offset + get_desc_trans_len(desc) +
2057				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
2058		if (!c_bh)
2059			return 0;
2060		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2061		if (journal_compare_desc_commit(sb, desc, commit)) {
2062			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2063				       "journal_transaction_is_valid, commit offset %ld had bad "
2064				       "time %d or length %d",
2065				       c_bh->b_blocknr -
2066				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2067				       get_commit_trans_id(commit),
2068				       get_commit_trans_len(commit));
2069			brelse(c_bh);
2070			if (oldest_invalid_trans_id) {
2071				*oldest_invalid_trans_id =
2072				    get_desc_trans_id(desc);
2073				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2074					       "journal-1004: "
2075					       "transaction_is_valid setting oldest invalid trans_id "
2076					       "to %d",
2077					       get_desc_trans_id(desc));
2078			}
2079			return -1;
2080		}
2081		brelse(c_bh);
2082		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2083			       "journal-1006: found valid "
2084			       "transaction start offset %llu, len %d id %d",
2085			       d_bh->b_blocknr -
2086			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2087			       get_desc_trans_len(desc),
2088			       get_desc_trans_id(desc));
2089		return 1;
2090	} else {
2091		return 0;
2092	}
2093}
2094
2095static void brelse_array(struct buffer_head **heads, int num)
2096{
2097	int i;
2098	for (i = 0; i < num; i++) {
2099		brelse(heads[i]);
2100	}
2101}
2102
2103/*
2104 * given the start, and values for the oldest acceptable transactions,
2105 * given the start and values for the oldest acceptable transactions,
2106 * this either reads in and replays a transaction, or returns because the
2107 * transaction is invalid or too old.
2108 */
2109static int journal_read_transaction(struct super_block *sb,
2110				    unsigned long cur_dblock,
2111				    unsigned long oldest_start,
2112				    unsigned int oldest_trans_id,
2113				    unsigned long newest_mount_id)
2114{
2115	struct reiserfs_journal *journal = SB_JOURNAL(sb);
2116	struct reiserfs_journal_desc *desc;
2117	struct reiserfs_journal_commit *commit;
2118	unsigned int trans_id = 0;
2119	struct buffer_head *c_bh;
2120	struct buffer_head *d_bh;
2121	struct buffer_head **log_blocks = NULL;
2122	struct buffer_head **real_blocks = NULL;
2123	unsigned int trans_offset;
2124	int i;
2125	int trans_half;
2126
2127	d_bh = journal_bread(sb, cur_dblock);
2128	if (!d_bh)
2129		return 1;
2130	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2131	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2132	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
2133		       "journal_read_transaction, offset %llu, len %d mount_id %d",
2134		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2135		       get_desc_trans_len(desc), get_desc_mount_id(desc));
2136	if (get_desc_trans_id(desc) < oldest_trans_id) {
2137		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
2138			       "journal_read_trans skipping because %lu is too old",
2139			       cur_dblock -
2140			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2141		brelse(d_bh);
2142		return 1;
2143	}
2144	if (get_desc_mount_id(desc) != newest_mount_id) {
2145		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
2146			       "journal_read_trans skipping because %d is != "
2147			       "newest_mount_id %lu", get_desc_mount_id(desc),
2148			       newest_mount_id);
2149		brelse(d_bh);
2150		return 1;
2151	}
2152	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2153			     ((trans_offset + get_desc_trans_len(desc) + 1) %
2154			      SB_ONDISK_JOURNAL_SIZE(sb)));
2155	if (!c_bh) {
2156		brelse(d_bh);
2157		return 1;
2158	}
2159	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2160	if (journal_compare_desc_commit(sb, desc, commit)) {
2161		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2162			       "journal_read_transaction, "
2163			       "commit offset %llu had bad time %d or length %d",
2164			       c_bh->b_blocknr -
2165			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2166			       get_commit_trans_id(commit),
2167			       get_commit_trans_len(commit));
2168		brelse(c_bh);
2169		brelse(d_bh);
2170		return 1;
2171	}
2172
2173	if (bdev_read_only(sb->s_bdev)) {
2174		reiserfs_warning(sb, "clm-2076",
2175				 "device is readonly, unable to replay log");
2176		brelse(c_bh);
2177		brelse(d_bh);
2178		return -EROFS;
2179	}
2180
2181	trans_id = get_desc_trans_id(desc);
2182	/*
2183	 * now we know we've got a good transaction, and it was
2184	 * inside the valid time ranges
2185	 */
2186	log_blocks = kmalloc_array(get_desc_trans_len(desc),
2187				   sizeof(struct buffer_head *),
2188				   GFP_NOFS);
2189	real_blocks = kmalloc_array(get_desc_trans_len(desc),
2190				    sizeof(struct buffer_head *),
2191				    GFP_NOFS);
2192	if (!log_blocks || !real_blocks) {
2193		brelse(c_bh);
2194		brelse(d_bh);
2195		kfree(log_blocks);
2196		kfree(real_blocks);
2197		reiserfs_warning(sb, "journal-1169",
2198				 "kmalloc failed, unable to mount FS");
2199		return -1;
2200	}
2201	/* get all the buffer heads */
2202	trans_half = journal_trans_half(sb->s_blocksize);
2203	for (i = 0; i < get_desc_trans_len(desc); i++) {
2204		log_blocks[i] =
2205		    journal_getblk(sb,
2206				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2207				   (trans_offset + 1 +
2208				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
2209		if (i < trans_half) {
2210			real_blocks[i] =
2211			    sb_getblk(sb,
2212				      le32_to_cpu(desc->j_realblock[i]));
2213		} else {
2214			real_blocks[i] =
2215			    sb_getblk(sb,
2216				      le32_to_cpu(commit->
2217						  j_realblock[i - trans_half]));
2218		}
2219		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
2220			reiserfs_warning(sb, "journal-1207",
2221					 "REPLAY FAILURE fsck required! "
2222					 "Block to replay is outside of "
2223					 "filesystem");
2224			goto abort_replay;
2225		}
2226		/* make sure we don't try to replay onto log or reserved area */
2227		if (is_block_in_log_or_reserved_area
2228		    (sb, real_blocks[i]->b_blocknr)) {
2229			reiserfs_warning(sb, "journal-1204",
2230					 "REPLAY FAILURE fsck required! "
2231					 "Trying to replay onto a log block");
2232abort_replay:
2233			brelse_array(log_blocks, i);
2234			brelse_array(real_blocks, i);
2235			brelse(c_bh);
2236			brelse(d_bh);
2237			kfree(log_blocks);
2238			kfree(real_blocks);
2239			return -1;
2240		}
2241	}
2242	/* read in the log blocks, memcpy to the corresponding real block */
2243	bh_read_batch(get_desc_trans_len(desc), log_blocks);
2244	for (i = 0; i < get_desc_trans_len(desc); i++) {
2246		wait_on_buffer(log_blocks[i]);
2247		if (!buffer_uptodate(log_blocks[i])) {
2248			reiserfs_warning(sb, "journal-1212",
2249					 "REPLAY FAILURE fsck required! "
2250					 "buffer write failed");
2251			brelse_array(log_blocks + i,
2252				     get_desc_trans_len(desc) - i);
2253			brelse_array(real_blocks, get_desc_trans_len(desc));
2254			brelse(c_bh);
2255			brelse(d_bh);
2256			kfree(log_blocks);
2257			kfree(real_blocks);
2258			return -1;
2259		}
2260		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
2261		       real_blocks[i]->b_size);
2262		set_buffer_uptodate(real_blocks[i]);
2263		brelse(log_blocks[i]);
2264	}
2265	/* flush out the real blocks */
2266	for (i = 0; i < get_desc_trans_len(desc); i++) {
2267		set_buffer_dirty(real_blocks[i]);
2268		write_dirty_buffer(real_blocks[i], 0);
2269	}
2270	for (i = 0; i < get_desc_trans_len(desc); i++) {
2271		wait_on_buffer(real_blocks[i]);
2272		if (!buffer_uptodate(real_blocks[i])) {
2273			reiserfs_warning(sb, "journal-1226",
2274					 "REPLAY FAILURE, fsck required! "
2275					 "buffer write failed");
2276			brelse_array(real_blocks + i,
2277				     get_desc_trans_len(desc) - i);
2278			brelse(c_bh);
2279			brelse(d_bh);
2280			kfree(log_blocks);
2281			kfree(real_blocks);
2282			return -1;
2283		}
2284		brelse(real_blocks[i]);
2285	}
2286	cur_dblock =
2287	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2288	    ((trans_offset + get_desc_trans_len(desc) +
2289	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
2290	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2291		       "journal-1095: setting journal start to offset %ld",
2292		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2293
2294	/*
2295	 * init starting values for the first transaction, in case
2296	 * this is the last transaction to be replayed.
2297	 */
2298	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2299	journal->j_last_flush_trans_id = trans_id;
2300	journal->j_trans_id = trans_id + 1;
2301	/* check for trans_id overflow */
2302	if (journal->j_trans_id == 0)
2303		journal->j_trans_id = 10;
2304	brelse(c_bh);
2305	brelse(d_bh);
2306	kfree(log_blocks);
2307	kfree(real_blocks);
2308	return 0;
2309}
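
/*
 * A note on the layout the replay loop above assumes: a transaction
 * logging N blocks keeps the first journal_trans_half() real block
 * numbers in the desc block's j_realblock[] and the rest in the commit
 * block's j_realblock[].  With a 4k blocksize (trans_half == 1018):
 *
 *	i <  1018:  real block = le32_to_cpu(desc->j_realblock[i])
 *	i >= 1018:  real block = le32_to_cpu(commit->j_realblock[i - 1018])
 */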
2310
2311/*
2312 * This function reads blocks starting at block, up to max_block, of bufsize
2313 * size (but no more than BUFNR blocks at a time).  This proved to improve
2314 * mounting speed on self-rebuilding raid5 arrays at least.
2315 * Right now it is only used from journal code. But later we might use it
2316 * from other places.
2317 * Note: Do not use journal_getblk/sb_getblk functions here!
2318 */
2319static struct buffer_head *reiserfs_breada(struct block_device *dev,
2320					   b_blocknr_t block, int bufsize,
2321					   b_blocknr_t max_block)
2322{
2323	struct buffer_head *bhlist[BUFNR];
2324	unsigned int blocks = BUFNR;
2325	struct buffer_head *bh;
2326	int i, j;
2327
2328	bh = __getblk(dev, block, bufsize);
2329	if (!bh || buffer_uptodate(bh))
2330		return bh;
2331
2332	if (block + BUFNR > max_block) {
2333		blocks = max_block - block;
2334	}
2335	bhlist[0] = bh;
2336	j = 1;
2337	for (i = 1; i < blocks; i++) {
2338		bh = __getblk(dev, block + i, bufsize);
2339		if (!bh)
2340			break;
2341		if (buffer_uptodate(bh)) {
2342			brelse(bh);
2343			break;
2344		} else
2345			bhlist[j++] = bh;
2346	}
2347	bh = bhlist[0];
2348	bh_read_nowait(bh, 0);
2349	bh_readahead_batch(j - 1, &bhlist[1], 0);
2350	for (i = 1; i < j; i++)
2351		brelse(bhlist[i]);
2352	wait_on_buffer(bh);
2353	if (buffer_uptodate(bh))
2354		return bh;
2355	brelse(bh);
2356	return NULL;
2357}
2358
2359/*
2360 * read and replay the log
2361 * on a clean unmount, the journal header's next unflushed pointer will be
2362 * to an invalid transaction.  This tests that before finding all the
2363 * transactions in the log, which makes normal mount times fast.
2364 *
2365 * After a crash, this starts with the next unflushed transaction, and
2366 * replays until it finds one too old, or invalid.
2367 *
2368 * On exit, it sets things up so the first transaction will work correctly.
2369 * NOTE: only called during fs mount
2370 */
2371static int journal_read(struct super_block *sb)
2372{
2373	struct reiserfs_journal *journal = SB_JOURNAL(sb);
2374	struct reiserfs_journal_desc *desc;
2375	unsigned int oldest_trans_id = 0;
2376	unsigned int oldest_invalid_trans_id = 0;
2377	time64_t start;
2378	unsigned long oldest_start = 0;
2379	unsigned long cur_dblock = 0;
2380	unsigned long newest_mount_id = 9;
2381	struct buffer_head *d_bh;
2382	struct reiserfs_journal_header *jh;
2383	int valid_journal_header = 0;
2384	int replay_count = 0;
2385	int continue_replay = 1;
2386	int ret;
2387
2388	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2389	reiserfs_info(sb, "checking transaction log (%pg)\n",
2390		      journal->j_dev_bd);
2391	start = ktime_get_seconds();
2392
2393	/*
2394	 * step 1, read in the journal header block.  Check the transaction
2395	 * it says is the first unflushed, and if that transaction is not
2396	 * valid, replay is done
2397	 */
2398	journal->j_header_bh = journal_bread(sb,
2399					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
2400					     + SB_ONDISK_JOURNAL_SIZE(sb));
2401	if (!journal->j_header_bh) {
2402		return 1;
2403	}
2404	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
2405	if (le32_to_cpu(jh->j_first_unflushed_offset) <
2406	    SB_ONDISK_JOURNAL_SIZE(sb)
2407	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2408		oldest_start =
2409		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2410		    le32_to_cpu(jh->j_first_unflushed_offset);
2411		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2412		newest_mount_id = le32_to_cpu(jh->j_mount_id);
2413		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2414			       "journal-1153: found in "
2415			       "header: first_unflushed_offset %d, last_flushed_trans_id "
2416			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
2417			       le32_to_cpu(jh->j_last_flush_trans_id));
2418		valid_journal_header = 1;
2419
2420		/*
2421		 * now, we try to read the first unflushed offset.  If it
2422		 * is not valid, there is nothing more we can do, and it
2423		 * makes no sense to read through the whole log.
2424		 */
2425		d_bh =
2426		    journal_bread(sb,
2427				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2428				  le32_to_cpu(jh->j_first_unflushed_offset));
2429		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
2430		if (!ret) {
2431			continue_replay = 0;
2432		}
2433		brelse(d_bh);
2434		goto start_log_replay;
2435	}
2436
2437	/*
2438	 * ok, there are transactions that need to be replayed.  start
2439	 * with the first log block, find all the valid transactions, and
2440	 * pick out the oldest.
2441	 */
2442	while (continue_replay
2443	       && cur_dblock <
2444	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2445		SB_ONDISK_JOURNAL_SIZE(sb))) {
2446		/*
2447		 * Note that the blocksize of the primary fs device and
2448		 * of the journal device is required to be the same
2449		 */
2450		d_bh =
2451		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
2452				    sb->s_blocksize,
2453				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2454				    SB_ONDISK_JOURNAL_SIZE(sb));
2455		ret =
2456		    journal_transaction_is_valid(sb, d_bh,
2457						 &oldest_invalid_trans_id,
2458						 &newest_mount_id);
2459		if (ret == 1) {
2460			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2461			if (oldest_start == 0) {	/* init all oldest_ values */
2462				oldest_trans_id = get_desc_trans_id(desc);
2463				oldest_start = d_bh->b_blocknr;
2464				newest_mount_id = get_desc_mount_id(desc);
2465				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2466					       "journal-1179: Setting "
2467					       "oldest_start to offset %llu, trans_id %lu",
2468					       oldest_start -
2469					       SB_ONDISK_JOURNAL_1st_BLOCK
2470					       (sb), oldest_trans_id);
2471			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
2472				/* one we just read was older */
2473				oldest_trans_id = get_desc_trans_id(desc);
2474				oldest_start = d_bh->b_blocknr;
2475				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2476					       "journal-1180: Resetting "
2477					       "oldest_start to offset %lu, trans_id %lu",
2478					       oldest_start -
2479					       SB_ONDISK_JOURNAL_1st_BLOCK
2480					       (sb), oldest_trans_id);
2481			}
2482			if (newest_mount_id < get_desc_mount_id(desc)) {
2483				newest_mount_id = get_desc_mount_id(desc);
2484				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2485					       "journal-1299: Setting "
2486					       "newest_mount_id to %d",
2487					       get_desc_mount_id(desc));
2488			}
2489			cur_dblock += get_desc_trans_len(desc) + 2;
2490		} else {
2491			cur_dblock++;
2492		}
2493		brelse(d_bh);
2494	}
2495
2496start_log_replay:
2497	cur_dblock = oldest_start;
2498	if (oldest_trans_id) {
2499		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2500			       "journal-1206: Starting replay "
2501			       "from offset %llu, trans_id %lu",
2502			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2503			       oldest_trans_id);
2504
2506	replay_count = 0;
2507	while (continue_replay && oldest_trans_id > 0) {
2508		ret =
2509		    journal_read_transaction(sb, cur_dblock, oldest_start,
2510					     oldest_trans_id, newest_mount_id);
2511		if (ret < 0) {
2512			return ret;
2513		} else if (ret != 0) {
2514			break;
2515		}
2516		cur_dblock =
2517		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
2518		replay_count++;
2519		if (cur_dblock == oldest_start)
2520			break;
2521	}
2522
2523	if (oldest_trans_id == 0) {
2524		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2525			       "journal-1225: No valid transactions found");
2526	}
2527	/*
2528	 * j_start does not get set correctly if we don't replay any
2529	 * transactions.  if we had a valid journal_header, set j_start
2530	 * to the first unflushed transaction value, copy the trans_id
2531	 * from the header
2532	 */
2533	if (valid_journal_header && replay_count == 0) {
2534		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
2535		journal->j_trans_id =
2536		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2537		/* check for trans_id overflow */
2538		if (journal->j_trans_id == 0)
2539			journal->j_trans_id = 10;
2540		journal->j_last_flush_trans_id =
2541		    le32_to_cpu(jh->j_last_flush_trans_id);
2542		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
2543	} else {
2544		journal->j_mount_id = newest_mount_id + 1;
2545	}
2546	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2547		       "newest_mount_id to %lu", journal->j_mount_id);
2548	journal->j_first_unflushed_offset = journal->j_start;
2549	if (replay_count > 0) {
2550		reiserfs_info(sb,
2551			      "replayed %d transactions in %lu seconds\n",
2552			      replay_count, ktime_get_seconds() - start);
2553	}
2554	/* needed to satisfy the locking in _update_journal_header_block */
2555	reiserfs_write_lock(sb);
2556	if (!bdev_read_only(sb->s_bdev) &&
2557	    _update_journal_header_block(sb, journal->j_start,
2558					 journal->j_last_flush_trans_id)) {
2559		reiserfs_write_unlock(sb);
2560		/*
2561		 * replay failed, caller must call free_journal_ram and abort
2562		 * the mount
2563		 */
2564		return -1;
2565	}
2566	reiserfs_write_unlock(sb);
2567	return 0;
2568}
2569
2570static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2571{
2572	struct reiserfs_journal_list *jl;
2573	jl = kzalloc(sizeof(struct reiserfs_journal_list),
2574		     GFP_NOFS | __GFP_NOFAIL);
2575	INIT_LIST_HEAD(&jl->j_list);
2576	INIT_LIST_HEAD(&jl->j_working_list);
2577	INIT_LIST_HEAD(&jl->j_tail_bh_list);
2578	INIT_LIST_HEAD(&jl->j_bh_list);
2579	mutex_init(&jl->j_commit_mutex);
2580	SB_JOURNAL(s)->j_num_lists++;
2581	get_journal_list(jl);
2582	return jl;
2583}
2584
2585static void journal_list_init(struct super_block *sb)
2586{
2587	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
2588}
2589
2590static void release_journal_dev(struct super_block *super,
2591			       struct reiserfs_journal *journal)
2592{
2593	if (journal->j_dev_bd != NULL) {
2594		void *holder = NULL;
2595
2596		if (journal->j_dev_bd->bd_dev != super->s_dev)
2597			holder = journal;
2598
2599		blkdev_put(journal->j_dev_bd, holder);
2600		journal->j_dev_bd = NULL;
2601	}
2602}
2603
2604static int journal_init_dev(struct super_block *super,
2605			    struct reiserfs_journal *journal,
2606			    const char *jdev_name)
2607{
2608	blk_mode_t blkdev_mode = BLK_OPEN_READ;
2609	void *holder = journal;
2610	int result;
2611	dev_t jdev;
2612
2613	result = 0;
2614
2615	journal->j_dev_bd = NULL;
2616	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
2617	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
2618
2619	if (!bdev_read_only(super->s_bdev))
2620		blkdev_mode |= BLK_OPEN_WRITE;
2621
2622	/* no "jdev" option: the journal device (possibly separate) comes from the super block */
2623	if ((!jdev_name || !jdev_name[0])) {
2624		if (jdev == super->s_dev)
2625			holder = NULL;
2626		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder,
2627						      NULL);
2628		if (IS_ERR(journal->j_dev_bd)) {
2629			result = PTR_ERR(journal->j_dev_bd);
2630			journal->j_dev_bd = NULL;
2631			reiserfs_warning(super, "sh-458",
2632					 "cannot init journal device unknown-block(%u,%u): %i",
2633					 MAJOR(jdev), MINOR(jdev), result);
2634			return result;
2635		} else if (jdev != super->s_dev)
2636			set_blocksize(journal->j_dev_bd, super->s_blocksize);
2637
2638		return 0;
2639	}
2640
2641	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder,
2642					       NULL);
2643	if (IS_ERR(journal->j_dev_bd)) {
2644		result = PTR_ERR(journal->j_dev_bd);
2645		journal->j_dev_bd = NULL;
2646		reiserfs_warning(super, "sh-457",
2647				 "journal_init_dev: Cannot open '%s': %i",
2648				 jdev_name, result);
2649		return result;
2650	}
2651
2652	set_blocksize(journal->j_dev_bd, super->s_blocksize);
2653	reiserfs_info(super,
2654		      "journal_init_dev: journal device: %pg\n",
2655		      journal->j_dev_bd);
2656	return 0;
2657}
2658
2659/*
2660 * When creating/tuning a file system user can assign some
2661 * journal params within boundaries which depend on the ratio
2662 * blocksize/standard_blocksize.
2663 *
2664 * For blocks >= standard_blocksize the transaction size should
2665 * be no less than JOURNAL_TRANS_MIN_DEFAULT, and no more
2666 * than JOURNAL_TRANS_MAX_DEFAULT.
2667 *
2668 * For blocks < standard_blocksize these boundaries should be
2669 * decreased proportionally.
2670 */
2671#define REISERFS_STANDARD_BLKSIZE (4096)
2672
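/*
 * Worked example (illustrative): with a 1024-byte blocksize the ratio
 * is REISERFS_STANDARD_BLKSIZE / 1024 = 4, so a user-supplied
 * j_trans_max must fall between JOURNAL_TRANS_MIN_DEFAULT / 4 and
 * JOURNAL_TRANS_MAX_DEFAULT / 4, and the on-disk journal must still be
 * at least JOURNAL_MIN_RATIO times j_trans_max blocks long.
 */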
2673static int check_advise_trans_params(struct super_block *sb,
2674				     struct reiserfs_journal *journal)
2675{
2676	if (journal->j_trans_max) {
2677		/* Non-default journal params.  Do sanity check for them. */
2678		int ratio = 1;
2679		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
2680			ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
2681
2682		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
2683		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
2684		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
2685		    JOURNAL_MIN_RATIO) {
2686			reiserfs_warning(sb, "sh-462",
2687					 "bad transaction max size (%u). "
2688					 "FSCK?", journal->j_trans_max);
2689			return 1;
2690		}
2691		if (journal->j_max_batch != (journal->j_trans_max) *
2692		    JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT) {
2693			reiserfs_warning(sb, "sh-463",
2694					 "bad transaction max batch (%u). "
2695					 "FSCK?", journal->j_max_batch);
2696			return 1;
2697		}
2698	} else {
2699		/*
2700		 * Default journal params.
2701		 * The file system was created by an old version
2702		 * of mkreiserfs, so some fields contain zeros,
2703		 * and we need to advise proper values for them
2704		 */
2705		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
2706			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
2707					 sb->s_blocksize);
2708			return 1;
2709		}
2710		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
2711		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
2712		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
2713	}
2714	return 0;
2715}
2716
2717/* must be called once on fs mount.  calls journal_read for you */
2718int journal_init(struct super_block *sb, const char *j_dev_name,
2719		 int old_format, unsigned int commit_max_age)
2720{
2721	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
2722	struct buffer_head *bhjh;
2723	struct reiserfs_super_block *rs;
2724	struct reiserfs_journal_header *jh;
2725	struct reiserfs_journal *journal;
2726	struct reiserfs_journal_list *jl;
2727	int ret;
2728
2729	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
2730	if (!journal) {
2731		reiserfs_warning(sb, "journal-1256",
2732				 "unable to get memory for journal structure");
2733		return 1;
2734	}
2735	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
2736	INIT_LIST_HEAD(&journal->j_prealloc_list);
2737	INIT_LIST_HEAD(&journal->j_working_list);
2738	INIT_LIST_HEAD(&journal->j_journal_list);
2739	journal->j_persistent_trans = 0;
2740	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2741					   reiserfs_bmap_count(sb)))
2742		goto free_and_return;
2743
2744	allocate_bitmap_nodes(sb);
2745
2746	/* reserved for journal area support */
2747	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
2748						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
2749						 / sb->s_blocksize +
2750						 reiserfs_bmap_count(sb) +
2751						 1 :
2752						 REISERFS_DISK_OFFSET_IN_BYTES /
2753						 sb->s_blocksize + 2);
2754
2755	/*
2756	 * Sanity check to see if the standard journal fits
2757	 * within the first bitmap (relevant for small blocksizes)
2758	 */
2759	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2760	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
2761	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
2762		reiserfs_warning(sb, "journal-1393",
2763				 "journal does not fit for area addressed "
2764				 "by first of bitmap blocks. It starts at "
2765				 "%u and its size is %u. Block size %ld",
2766				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
2767				 SB_ONDISK_JOURNAL_SIZE(sb),
2768				 sb->s_blocksize);
2769		goto free_and_return;
2770	}
2771
2772	/*
2773	 * Sanity check to see if journal first block is correct.
2774	 * If the journal first block is invalid it can cause
2775	 * zeroing of important superblock members.
2776	 */
2777	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2778	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
2779		reiserfs_warning(sb, "journal-1393",
2780				 "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
2781				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
2782				 SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2783		goto free_and_return;
2784	}
2785
2786	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2787		reiserfs_warning(sb, "sh-462",
2788				 "unable to initialize journal device");
2789		goto free_and_return;
2790	}
2791
2792	rs = SB_DISK_SUPER_BLOCK(sb);
2793
2794	/* read journal header */
2795	bhjh = journal_bread(sb,
2796			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2797			     SB_ONDISK_JOURNAL_SIZE(sb));
2798	if (!bhjh) {
2799		reiserfs_warning(sb, "sh-459",
2800				 "unable to read journal header");
2801		goto free_and_return;
2802	}
2803	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
2804
2805	/* make sure that the journal matches the super block */
2806	if (is_reiserfs_jr(rs)
2807	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
2808		sb_jp_journal_magic(rs))) {
2809		reiserfs_warning(sb, "sh-460",
2810				 "journal header magic %x (device %pg) does "
2811				 "not match to magic found in super block %x",
2812				 jh->jh_journal.jp_journal_magic,
2813				 journal->j_dev_bd,
2814				 sb_jp_journal_magic(rs));
2815		brelse(bhjh);
2816		goto free_and_return;
2817	}
2818
2819	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
2820	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
2821	journal->j_max_commit_age =
2822	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
2823	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
2824
2825	if (check_advise_trans_params(sb, journal) != 0)
2826		goto free_and_return;
2827	journal->j_default_max_commit_age = journal->j_max_commit_age;
2828
2829	if (commit_max_age != 0) {
2830		journal->j_max_commit_age = commit_max_age;
2831		journal->j_max_trans_age = commit_max_age;
2832	}
2833
2834	reiserfs_info(sb, "journal params: device %pg, size %u, "
2835		      "journal first block %u, max trans len %u, max batch %u, "
2836		      "max commit age %u, max trans age %u\n",
2837		      journal->j_dev_bd,
2838		      SB_ONDISK_JOURNAL_SIZE(sb),
2839		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2840		      journal->j_trans_max,
2841		      journal->j_max_batch,
2842		      journal->j_max_commit_age, journal->j_max_trans_age);
2843
2844	brelse(bhjh);
2845
2846	journal->j_list_bitmap_index = 0;
2847	journal_list_init(sb);
2848
2849	memset(journal->j_list_hash_table, 0,
2850	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
2851
2852	INIT_LIST_HEAD(&journal->j_dirty_buffers);
2853	spin_lock_init(&journal->j_dirty_buffers_lock);
2854
2855	journal->j_start = 0;
2856	journal->j_len = 0;
2857	journal->j_len_alloc = 0;
2858	atomic_set(&journal->j_wcount, 0);
2859	atomic_set(&journal->j_async_throttle, 0);
2860	journal->j_bcount = 0;
2861	journal->j_trans_start_time = 0;
2862	journal->j_last = NULL;
2863	journal->j_first = NULL;
2864	init_waitqueue_head(&journal->j_join_wait);
2865	mutex_init(&journal->j_mutex);
2866	mutex_init(&journal->j_flush_mutex);
2867
2868	journal->j_trans_id = 10;
2869	journal->j_mount_id = 10;
2870	journal->j_state = 0;
2871	atomic_set(&journal->j_jlock, 0);
2872	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2873	journal->j_cnode_free_orig = journal->j_cnode_free_list;
2874	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2875	journal->j_cnode_used = 0;
2876	journal->j_must_wait = 0;
2877
2878	if (journal->j_cnode_free == 0) {
2879		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
2880		                 "allocation failed (%ld bytes). Journal is "
2881		                 "too large for available memory. Usually "
2882		                 "this is due to a journal that is too large.",
2883		                 sizeof(struct reiserfs_journal_cnode) * num_cnodes);
2884		goto free_and_return;
2885	}
2886
2887	init_journal_hash(sb);
2888	jl = journal->j_current_jl;
2889
2890	/*
2891	 * get_list_bitmap() may call flush_commit_list() which
2892	 * requires the lock. Calling flush_commit_list() shouldn't happen
2893	 * this early but I like to be paranoid.
2894	 */
2895	reiserfs_write_lock(sb);
2896	jl->j_list_bitmap = get_list_bitmap(sb, jl);
2897	reiserfs_write_unlock(sb);
2898	if (!jl->j_list_bitmap) {
2899		reiserfs_warning(sb, "journal-2005",
2900				 "get_list_bitmap failed for journal list 0");
2901		goto free_and_return;
2902	}
2903
2904	ret = journal_read(sb);
2905	if (ret < 0) {
2906		reiserfs_warning(sb, "reiserfs-2006",
2907				 "Replay Failure, unable to mount");
2908		goto free_and_return;
2909	}
2910
2911	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2912	journal->j_work_sb = sb;
2913	return 0;
2914free_and_return:
2915	free_journal_ram(sb);
2916	return 1;
2917}
2918
2919/*
2920 * test for a polite end of the current transaction.  Used by file_write,
2921 * and should be used by delete to make sure they don't write more than
2922 * can fit inside a single transaction
2923 */
2924int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2925				   int new_alloc)
2926{
2927	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2928	time64_t now = ktime_get_seconds();
2929	/* cannot restart while nested */
2930	BUG_ON(!th->t_trans_id);
2931	if (th->t_refcount > 1)
2932		return 0;
2933	if (journal->j_must_wait > 0 ||
2934	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
2935	    atomic_read(&journal->j_jlock) ||
2936	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
2937	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
2938		return 1;
2939	}
2940
2941	journal->j_len_alloc += new_alloc;
2942	th->t_blocks_allocated += new_alloc;
2943	return 0;
2944}
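
/*
 * Sketch of the intended calling pattern (illustrative only; how the
 * caller ends and re-begins its handle, e.g. via a restart_transaction
 * helper, is up to the caller):
 *
 *	if (journal_transaction_should_end(th, blocks_needed)) {
 *		politely end th and begin a fresh transaction
 *		before logging blocks_needed more blocks
 *	}
 */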
2945
2946/* this must be called inside a transaction */
2947void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2948{
2949	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2950	BUG_ON(!th->t_trans_id);
2951	journal->j_must_wait = 1;
2952	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
2953	return;
2954}
2955
2956/* this must be called without a transaction started */
2957void reiserfs_allow_writes(struct super_block *s)
2958{
2959	struct reiserfs_journal *journal = SB_JOURNAL(s);
2960	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
2961	wake_up(&journal->j_join_wait);
2962}
2963
2964/* this must be called without a transaction started */
2965void reiserfs_wait_on_write_block(struct super_block *s)
2966{
2967	struct reiserfs_journal *journal = SB_JOURNAL(s);
2968	wait_event(journal->j_join_wait,
2969		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
2970}
2971
2972static void queue_log_writer(struct super_block *s)
2973{
2974	wait_queue_entry_t wait;
2975	struct reiserfs_journal *journal = SB_JOURNAL(s);
2976	set_bit(J_WRITERS_QUEUED, &journal->j_state);
2977
2978	/*
2979	 * we don't want to use wait_event here because
2980	 * we only want to wait once.
2981	 */
2982	init_waitqueue_entry(&wait, current);
2983	add_wait_queue(&journal->j_join_wait, &wait);
2984	set_current_state(TASK_UNINTERRUPTIBLE);
2985	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
2986		int depth = reiserfs_write_unlock_nested(s);
2987		schedule();
2988		reiserfs_write_lock_nested(s, depth);
2989	}
2990	__set_current_state(TASK_RUNNING);
2991	remove_wait_queue(&journal->j_join_wait, &wait);
2992}
2993
2994static void wake_queued_writers(struct super_block *s)
2995{
2996	struct reiserfs_journal *journal = SB_JOURNAL(s);
2997	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
2998		wake_up(&journal->j_join_wait);
2999}
3000
3001static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
3002{
3003	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3004	unsigned long bcount = journal->j_bcount;
3005	while (1) {
3006		int depth;
3007
3008		depth = reiserfs_write_unlock_nested(sb);
3009		schedule_timeout_uninterruptible(1);
3010		reiserfs_write_lock_nested(sb, depth);
3011
3012		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
3013		while ((atomic_read(&journal->j_wcount) > 0 ||
3014			atomic_read(&journal->j_jlock)) &&
3015		       journal->j_trans_id == trans_id) {
3016			queue_log_writer(sb);
3017		}
3018		if (journal->j_trans_id != trans_id)
3019			break;
3020		if (bcount == journal->j_bcount)
3021			break;
3022		bcount = journal->j_bcount;
3023	}
3024}
3025
3026/*
3027 * join == true if you must join an existing transaction.
3028 * join == false if you can deal with waiting for others to finish
3029 *
3030 * this will block until the transaction is joinable.  send the number of
3031 * blocks you expect to use in nblocks.
3032 */
3033static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3034			      struct super_block *sb, unsigned long nblocks,
3035			      int join)
3036{
3037	time64_t now = ktime_get_seconds();
3038	unsigned int old_trans_id;
3039	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3040	struct reiserfs_transaction_handle myth;
3041	int retval;
3042	int depth;
3043
3044	reiserfs_check_lock_depth(sb, "journal_begin");
3045	BUG_ON(nblocks > journal->j_trans_max);
3046
3047	PROC_INFO_INC(sb, journal.journal_being);
3048	/* set here for journal_join */
3049	th->t_refcount = 1;
3050	th->t_super = sb;
3051
3052relock:
3053	lock_journal(sb);
3054	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
3055		unlock_journal(sb);
3056		retval = journal->j_errno;
3057		goto out_fail;
3058	}
3059	journal->j_bcount++;
3060
3061	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3062		unlock_journal(sb);
3063		depth = reiserfs_write_unlock_nested(sb);
3064		reiserfs_wait_on_write_block(sb);
3065		reiserfs_write_lock_nested(sb, depth);
3066		PROC_INFO_INC(sb, journal.journal_relock_writers);
3067		goto relock;
3068	}
3069	now = ktime_get_seconds();
3070
3071	/*
3072	 * if there is no room in the journal OR
3073	 * if this transaction is too old, and we weren't called joinable,
3074	 * wait for it to finish before beginning.  We don't sleep if there
3075	 * aren't other writers.
3076	 */
3077
3078	if ((!join && journal->j_must_wait > 0) ||
3079	    (!join
3080	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
3081	    || (!join && atomic_read(&journal->j_wcount) > 0
3082		&& journal->j_trans_start_time > 0
3083		&& (now - journal->j_trans_start_time) >
3084		journal->j_max_trans_age) || (!join
3085					      && atomic_read(&journal->j_jlock))
3086	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
3087
3088		old_trans_id = journal->j_trans_id;
3089		/* allow others to finish this transaction */
3090		unlock_journal(sb);
3091
3092		if (!join && (journal->j_len_alloc + nblocks + 2) >=
3093		    journal->j_max_batch &&
3094		    ((journal->j_len + nblocks + 2) * 100) <
3095		    (journal->j_len_alloc * 75)) {
3096			if (atomic_read(&journal->j_wcount) > 10) {
3097				queue_log_writer(sb);
3098				goto relock;
3099			}
3100		}
3101		/*
3102		 * don't mess with joining the transaction if all we
3103		 * have to do is wait for someone else to do a commit
3104		 */
3105		if (atomic_read(&journal->j_jlock)) {
3106			while (journal->j_trans_id == old_trans_id &&
3107			       atomic_read(&journal->j_jlock)) {
3108				queue_log_writer(sb);
3109			}
3110			goto relock;
3111		}
3112		retval = journal_join(&myth, sb);
3113		if (retval)
3114			goto out_fail;
3115
3116		/* someone might have ended the transaction while we joined */
3117		if (old_trans_id != journal->j_trans_id) {
3118			retval = do_journal_end(&myth, 0);
3119		} else {
3120			retval = do_journal_end(&myth, COMMIT_NOW);
3121		}
3122
3123		if (retval)
3124			goto out_fail;
3125
3126		PROC_INFO_INC(sb, journal.journal_relock_wcount);
3127		goto relock;
3128	}
3129	/* we are the first writer, set trans_id */
3130	if (journal->j_trans_start_time == 0) {
3131		journal->j_trans_start_time = ktime_get_seconds();
3132	}
3133	atomic_inc(&journal->j_wcount);
3134	journal->j_len_alloc += nblocks;
3135	th->t_blocks_logged = 0;
3136	th->t_blocks_allocated = nblocks;
3137	th->t_trans_id = journal->j_trans_id;
3138	unlock_journal(sb);
3139	INIT_LIST_HEAD(&th->t_list);
3140	return 0;
3141
3142out_fail:
3143	memset(th, 0, sizeof(*th));
3144	/*
3145	 * Re-set th->t_super, so we can properly keep track of how many
3146	 * persistent transactions there are. We need to do this so if this
3147	 * call is part of a failed restart_transaction, we can free it later
3148	 */
3149	th->t_super = sb;
3150	return retval;
3151}
3152
3153struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
3154								    super_block
3155								    *s,
3156								    int nblocks)
3157{
3158	int ret;
3159	struct reiserfs_transaction_handle *th;
3160
3161	/*
3162	 * if we're nesting into an existing transaction, it will be
3163	 * persistent on its own
3164	 */
3165	if (reiserfs_transaction_running(s)) {
3166		th = current->journal_info;
3167		th->t_refcount++;
3168		BUG_ON(th->t_refcount < 2);
3169
3170		return th;
3171	}
3172	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
3173	if (!th)
3174		return NULL;
3175	ret = journal_begin(th, s, nblocks);
3176	if (ret) {
3177		kfree(th);
3178		return NULL;
3179	}
3180
3181	SB_JOURNAL(s)->j_persistent_trans++;
3182	return th;
3183}
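
/*
 * Typical usage, a minimal sketch (error handling elided; the buffer
 * bh and the block count are illustrative):
 *
 *	struct reiserfs_transaction_handle *th;
 *
 *	th = reiserfs_persistent_transaction(sb, JOURNAL_PER_BALANCE_CNT);
 *	if (th) {
 *		reiserfs_prepare_for_journal(sb, bh, 1);
 *		... modify bh ...
 *		journal_mark_dirty(th, bh);
 *		reiserfs_end_persistent_transaction(th);
 *	}
 */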
3184
3185int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
3186{
3187	struct super_block *s = th->t_super;
3188	int ret = 0;
3189	if (th->t_trans_id)
3190		ret = journal_end(th);
3191	else
3192		ret = -EIO;
3193	if (th->t_refcount == 0) {
3194		SB_JOURNAL(s)->j_persistent_trans--;
3195		kfree(th);
3196	}
3197	return ret;
3198}
3199
3200static int journal_join(struct reiserfs_transaction_handle *th,
3201			struct super_block *sb)
3202{
3203	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3204
3205	/*
3206	 * this keeps do_journal_end from NULLing out the
3207	 * current->journal_info pointer
3208	 */
3209	th->t_handle_save = cur_th;
3210	BUG_ON(cur_th && cur_th->t_refcount > 1);
3211	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
3212}
3213
3214int journal_join_abort(struct reiserfs_transaction_handle *th,
3215		       struct super_block *sb)
3216{
3217	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3218
3219	/*
3220	 * this keeps do_journal_end from NULLing out the
3221	 * current->journal_info pointer
3222	 */
3223	th->t_handle_save = cur_th;
3224	BUG_ON(cur_th && cur_th->t_refcount > 1);
3225	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
3226}
3227
3228int journal_begin(struct reiserfs_transaction_handle *th,
3229		  struct super_block *sb, unsigned long nblocks)
3230{
3231	struct reiserfs_transaction_handle *cur_th = current->journal_info;
3232	int ret;
3233
3234	th->t_handle_save = NULL;
3235	if (cur_th) {
3236		/* we are nesting into the current transaction */
3237		if (cur_th->t_super == sb) {
3238			BUG_ON(!cur_th->t_refcount);
3239			cur_th->t_refcount++;
3240			memcpy(th, cur_th, sizeof(*th));
3241			if (th->t_refcount <= 1)
3242				reiserfs_warning(sb, "reiserfs-2005",
3243						 "BAD: refcount <= 1, but "
3244						 "journal_info != 0");
3245			return 0;
3246		} else {
3247			/*
3248			 * we've ended up with a handle from a different
3249			 * filesystem.  save it and restore on journal_end.
3250			 * This should never really happen...
3251			 */
3252			reiserfs_warning(sb, "clm-2100",
3253					 "nesting into a different FS");
3254			th->t_handle_save = current->journal_info;
3255			current->journal_info = th;
3256		}
3257	} else {
3258		current->journal_info = th;
3259	}
3260	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
3261	BUG_ON(current->journal_info != th);
3262
3263	/*
3264	 * I guess this boils down to being the reciprocal of clm-2100 above.
3265	 * If do_journal_begin_r fails, we need to put it back, since
3266	 * journal_end won't be called to do it. */
3267	if (ret)
3268		current->journal_info = th->t_handle_save;
3269	else
3270		BUG_ON(!th->t_refcount);
3271
3272	return ret;
3273}
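
/*
 * The usual lifecycle with an on-stack handle, as used elsewhere in
 * this file (see do_journal_release and reiserfs_flush_old_commits);
 * a sketch, not a complete example:
 *
 *	journal_begin(&th, sb, nblocks);
 *	reiserfs_prepare_for_journal(sb, bh, 1);
 *	... modify bh ...
 *	journal_mark_dirty(&th, bh);
 *	journal_end(&th);
 */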
3274
3275/*
3276 * puts bh into the current transaction.  If it was already there, it is
3277 * reordered: the old pointers are removed from the hash, and new ones are
3278 * put in (to make sure replay happens in the right order).
3279 *
3280 * if it was dirty, cleans and files onto the clean list.  I can't let it
3281 * be dirty again until the transaction is committed.
3282 *
3283 * if j_len is bigger than j_len_alloc, it pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT.
3284 */
3285int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3286		       struct buffer_head *bh)
3287{
3288	struct super_block *sb = th->t_super;
3289	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3290	struct reiserfs_journal_cnode *cn = NULL;
3291	int count_already_incd = 0;
3292	int prepared = 0;
3293	BUG_ON(!th->t_trans_id);
3294
3295	PROC_INFO_INC(sb, journal.mark_dirty);
3296	if (th->t_trans_id != journal->j_trans_id) {
3297		reiserfs_panic(th->t_super, "journal-1577",
3298			       "handle trans id %ld != current trans id %ld",
3299			       th->t_trans_id, journal->j_trans_id);
3300	}
3301
3302	prepared = test_clear_buffer_journal_prepared(bh);
3303	clear_buffer_journal_restore_dirty(bh);
3304	/* already in this transaction, we are done */
3305	if (buffer_journaled(bh)) {
3306		PROC_INFO_INC(sb, journal.mark_dirty_already);
3307		return 0;
3308	}
3309
3310	/*
3311	 * this must be turned into a panic instead of a warning.  We can't
3312	 * allow a dirty or journal_dirty or locked buffer to be logged, as
3313	 * some changes could get to disk too early.  NOT GOOD.
3314	 */
3315	if (!prepared || buffer_dirty(bh)) {
3316		reiserfs_warning(sb, "journal-1777",
3317				 "buffer %llu bad state "
3318				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
3319				 (unsigned long long)bh->b_blocknr,
3320				 prepared ? ' ' : '!',
3321				 buffer_locked(bh) ? ' ' : '!',
3322				 buffer_dirty(bh) ? ' ' : '!',
3323				 buffer_journal_dirty(bh) ? ' ' : '!');
3324	}
3325
3326	if (atomic_read(&journal->j_wcount) <= 0) {
3327		reiserfs_warning(sb, "journal-1409",
3328				 "returning because j_wcount was %d",
3329				 atomic_read(&journal->j_wcount));
3330		return 1;
3331	}
3332	/*
3333	 * this error means I've screwed up, and we've overflowed
3334	 * the transaction.  Nothing can be done here, except make the
3335	 * FS readonly or panic.
3336	 */
3337	if (journal->j_len >= journal->j_trans_max) {
3338		reiserfs_panic(th->t_super, "journal-1413",
3339			       "j_len (%lu) is too big",
3340			       journal->j_len);
3341	}
3342
3343	if (buffer_journal_dirty(bh)) {
3344		count_already_incd = 1;
3345		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
3346		clear_buffer_journal_dirty(bh);
3347	}
3348
3349	if (journal->j_len > journal->j_len_alloc) {
3350		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
3351	}
3352
3353	set_buffer_journaled(bh);
3354
3355	/* now put this guy on the end */
3356	if (!cn) {
3357		cn = get_cnode(sb);
3358		if (!cn) {
3359			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
3360		}
3361
3362		if (th->t_blocks_logged == th->t_blocks_allocated) {
3363			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
3364			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
3365		}
3366		th->t_blocks_logged++;
3367		journal->j_len++;
3368
3369		cn->bh = bh;
3370		cn->blocknr = bh->b_blocknr;
3371		cn->sb = sb;
3372		cn->jlist = NULL;
3373		insert_journal_hash(journal->j_hash_table, cn);
3374		if (!count_already_incd) {
3375			get_bh(bh);
3376		}
3377	}
3378	cn->next = NULL;
3379	cn->prev = journal->j_last;
3380	cn->bh = bh;
3381	if (journal->j_last) {
3382		journal->j_last->next = cn;
3383		journal->j_last = cn;
3384	} else {
3385		journal->j_first = cn;
3386		journal->j_last = cn;
3387	}
3388	reiserfs_schedule_old_flush(sb);
3389	return 0;
3390}
3391
3392int journal_end(struct reiserfs_transaction_handle *th)
3393{
3394	struct super_block *sb = th->t_super;
3395	if (!current->journal_info && th->t_refcount > 1)
3396		reiserfs_warning(sb, "REISER-NESTING",
3397				 "th NULL, refcount %d", th->t_refcount);
3398
3399	if (!th->t_trans_id) {
3400		WARN_ON(1);
3401		return -EIO;
3402	}
3403
3404	th->t_refcount--;
3405	if (th->t_refcount > 0) {
3406		struct reiserfs_transaction_handle *cur_th =
3407		    current->journal_info;
3408
3409		/*
3410		 * we aren't allowed to close a nested transaction on a
3411		 * different filesystem from the one in the task struct
3412		 */
3413		BUG_ON(cur_th->t_super != th->t_super);
3414
3415		if (th != cur_th) {
3416			memcpy(current->journal_info, th, sizeof(*th));
3417			th->t_trans_id = 0;
3418		}
3419		return 0;
3420	} else {
3421		return do_journal_end(th, 0);
3422	}
3423}
3424
3425/*
3426 * removes from the current transaction, brelse-ing and decrementing any counters.
3427 * also files the removed buffer directly onto the clean list
3428 *
3429 * called by journal_mark_freed when a block has been deleted
3430 *
3431 * returns 1 if it cleaned and brelse'd the buffer, 0 otherwise
3432 */
3433static int remove_from_transaction(struct super_block *sb,
3434				   b_blocknr_t blocknr, int already_cleaned)
3435{
3436	struct buffer_head *bh;
3437	struct reiserfs_journal_cnode *cn;
3438	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3439	int ret = 0;
3440
3441	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3442	if (!cn || !cn->bh) {
3443		return ret;
3444	}
3445	bh = cn->bh;
3446	if (cn->prev) {
3447		cn->prev->next = cn->next;
3448	}
3449	if (cn->next) {
3450		cn->next->prev = cn->prev;
3451	}
3452	if (cn == journal->j_first) {
3453		journal->j_first = cn->next;
3454	}
3455	if (cn == journal->j_last) {
3456		journal->j_last = cn->prev;
3457	}
3458	remove_journal_hash(sb, journal->j_hash_table, NULL,
3459			    bh->b_blocknr, 0);
3460	clear_buffer_journaled(bh);	/* don't log this one */
3461
3462	if (!already_cleaned) {
3463		clear_buffer_journal_dirty(bh);
3464		clear_buffer_dirty(bh);
3465		clear_buffer_journal_test(bh);
3466		put_bh(bh);
3467		if (atomic_read(&bh->b_count) < 0) {
3468			reiserfs_warning(sb, "journal-1752",
3469					 "b_count < 0");
3470		}
3471		ret = 1;
3472	}
3473	journal->j_len--;
3474	journal->j_len_alloc--;
3475	free_cnode(sb, cn);
3476	return ret;
3477}
3478
3479/*
3480 * for any cnode in a journal list, it can only be dirtied if all the
3481 * transactions that include it are committed to disk.
3482 * this checks through each transaction, and returns 1 if you are allowed
3483 * to dirty, and 0 if you aren't
3484 *
3485 * it is called by dirty_journal_list, which is called after
3486 * flush_commit_list has gotten all the log blocks for a given
3487 * transaction on disk
3488 *
3489 */
3490static int can_dirty(struct reiserfs_journal_cnode *cn)
3491{
3492	struct super_block *sb = cn->sb;
3493	b_blocknr_t blocknr = cn->blocknr;
3494	struct reiserfs_journal_cnode *cur = cn->hprev;
3495	int can_dirty = 1;
3496
3497	/*
3498	 * first test hprev.  These are all newer than cn, so any node here
3499	 * with the same block number and dev means this node can't be sent
3500	 * to disk right now.
3501	 */
3502	while (cur && can_dirty) {
3503		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
3504		    cur->blocknr == blocknr) {
3505			can_dirty = 0;
3506		}
3507		cur = cur->hprev;
3508	}
3509	/*
3510	 * then test hnext.  These are all older than cn.  As long as they
3511	 * are committed to the log, it is safe to write cn to disk
3512	 */
3513	cur = cn->hnext;
3514	while (cur && can_dirty) {
3515		if (cur->jlist && cur->jlist->j_len > 0 &&
3516		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
3517		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
3518			can_dirty = 0;
3519		}
3520		cur = cur->hnext;
3521	}
3522	return can_dirty;
3523}
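
/*
 * Hash-chain ordering assumed above, sketched: hprev neighbours were
 * logged after cn, hnext neighbours before it:
 *
 *	newer transactions <-hprev- cn -hnext-> older transactions
 *
 * cn may be dirtied only when no newer cnode still logs the same block
 * and every older cnode for it has its commit fully on disk.
 */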
3524
3525/*
3526 * syncs the commit blocks, but does not force the real buffers to disk.
3527 * will wait until the current transaction is done/committed before returning
3528 */
3529int journal_end_sync(struct reiserfs_transaction_handle *th)
3530{
3531	struct super_block *sb = th->t_super;
3532	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3533
3534	BUG_ON(!th->t_trans_id);
3535	/* syncing while nested would be very, very bad */
3536	BUG_ON(th->t_refcount > 1);
3537	if (journal->j_len == 0) {
3538		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3539					     1);
3540		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
3541	}
3542	return do_journal_end(th, COMMIT_NOW | WAIT);
3543}
3544
3545/* writeback the pending async commits to disk */
3546static void flush_async_commits(struct work_struct *work)
3547{
3548	struct reiserfs_journal *journal =
3549		container_of(work, struct reiserfs_journal, j_work.work);
3550	struct super_block *sb = journal->j_work_sb;
3551	struct reiserfs_journal_list *jl;
3552	struct list_head *entry;
3553
3554	reiserfs_write_lock(sb);
3555	if (!list_empty(&journal->j_journal_list)) {
3556		/* last entry is the youngest, commit it and you get everything */
3557		entry = journal->j_journal_list.prev;
3558		jl = JOURNAL_LIST_ENTRY(entry);
3559		flush_commit_list(sb, jl, 1);
3560	}
3561	reiserfs_write_unlock(sb);
3562}
3563
3564/*
3565 * flushes any old transactions to disk
3566 * ends the current transaction if it is too old
3567 */
3568void reiserfs_flush_old_commits(struct super_block *sb)
3569{
3570	time64_t now;
3571	struct reiserfs_transaction_handle th;
3572	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3573
3574	now = ktime_get_seconds();
3575	/*
3576	 * safety check so we don't flush while we are replaying the log during
3577	 * mount
3578	 */
3579	if (list_empty(&journal->j_journal_list))
3580		return;
3581
3582	/*
3583	 * check the current transaction.  If there are no writers, and it is
3584	 * too old, finish it, and force the commit blocks to disk
3585	 */
3586	if (atomic_read(&journal->j_wcount) <= 0 &&
3587	    journal->j_trans_start_time > 0 &&
3588	    journal->j_len > 0 &&
3589	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3590		if (!journal_join(&th, sb)) {
3591			reiserfs_prepare_for_journal(sb,
3592						     SB_BUFFER_WITH_SB(sb),
3593						     1);
3594			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
3595
3596			/*
3597			 * we're only being called from kreiserfsd; it makes
3598			 * no sense to do an async commit just so kreiserfsd
3599			 * can do it later
3600			 */
3601			do_journal_end(&th, COMMIT_NOW | WAIT);
3602		}
3603	}
3604}
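
/*
 * For example, with the default max_trans_age of 30 seconds, a
 * transaction started at t=100 whose last writer has already left is
 * ended and its commit forced to disk by the first pass through here
 * after t=130.
 */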
3605
3606/*
3607 * returns 0 if do_journal_end should return right away, returns 1 if
3608 * do_journal_end should finish the commit
3609 *
3610 * if the current transaction is too old, but still has writers, this will
3611 * wait on j_join_wait until all the writers are done.  By the time it
3612 * wakes up, the transaction it was called on has already ended, so it just
3613 * flushes the commit list and returns 0.
3614 *
3615 * Won't batch when flush or commit_now is set.  Also won't batch when
3616 * others are waiting on j_join_wait.
3617 *
3618 * Note, we can't allow the journal_end to proceed while there are still
3619 * writers in the log.
3620 */
3621static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
3622{
3623
3624	time64_t now;
3625	int flush = flags & FLUSH_ALL;
3626	int commit_now = flags & COMMIT_NOW;
3627	int wait_on_commit = flags & WAIT;
3628	struct reiserfs_journal_list *jl;
3629	struct super_block *sb = th->t_super;
3630	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3631
3632	BUG_ON(!th->t_trans_id);
3633
3634	if (th->t_trans_id != journal->j_trans_id) {
3635		reiserfs_panic(th->t_super, "journal-1577",
3636			       "handle trans id %ld != current trans id %ld",
3637			       th->t_trans_id, journal->j_trans_id);
3638	}
3639
3640	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
3641	/* <= 0 is allowed.  unmounting might not call begin */
3642	if (atomic_read(&journal->j_wcount) > 0)
3643		atomic_dec(&journal->j_wcount);
3644
3645	/*
3646	 * BUG: deal with the case where j_len is 0 but blocks freed earlier
3647	 * still need to be released.  Right now that is left to the next
3648	 * transaction that actually writes something, but it should be taken
3649	 * care of in this trans
3650	 */
3651	BUG_ON(journal->j_len == 0);
3652
3653	/*
3654	 * if wcount > 0, and we are called with flush or commit_now,
3655	 * we wait on j_join_wait.  We will wake up when the last writer has
3656	 * finished the transaction, and started it on its way to the disk.
3657	 * Then, we flush the commit or journal list, and just return 0
3658	 * because the rest of journal end was already done for this
3659	 * transaction.
3660	 */
3661	if (atomic_read(&journal->j_wcount) > 0) {
3662		if (flush || commit_now) {
3663			unsigned trans_id;
3664
3665			jl = journal->j_current_jl;
3666			trans_id = jl->j_trans_id;
3667			if (wait_on_commit)
3668				jl->j_state |= LIST_COMMIT_PENDING;
3669			atomic_set(&journal->j_jlock, 1);
3670			if (flush) {
3671				journal->j_next_full_flush = 1;
3672			}
3673			unlock_journal(sb);
3674
3675			/*
3676			 * sleep while the current transaction is
3677			 * still j_jlocked
3678			 */
3679			while (journal->j_trans_id == trans_id) {
3680				if (atomic_read(&journal->j_jlock)) {
3681					queue_log_writer(sb);
3682				} else {
3683					lock_journal(sb);
3684					if (journal->j_trans_id == trans_id) {
3685						atomic_set(&journal->j_jlock,
3686							   1);
3687					}
3688					unlock_journal(sb);
3689				}
3690			}
3691			BUG_ON(journal->j_trans_id == trans_id);
3692
3693			if (commit_now
3694			    && journal_list_still_alive(sb, trans_id)
3695			    && wait_on_commit) {
3696				flush_commit_list(sb, jl, 1);
3697			}
3698			return 0;
3699		}
3700		unlock_journal(sb);
3701		return 0;
3702	}
3703
3704	/* deal with old transactions where we are the last writers */
3705	now = ktime_get_seconds();
3706	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3707		commit_now = 1;
3708		journal->j_next_async_flush = 1;
3709	}
3710	/* don't batch when someone is waiting on j_join_wait */
3711	/* don't batch when syncing the commit or flushing the whole trans */
3712	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
3713	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
3714	    && journal->j_len_alloc < journal->j_max_batch
3715	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
3716		journal->j_bcount++;
3717		unlock_journal(sb);
3718		return 0;
3719	}
3720
3721	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
3722		reiserfs_panic(sb, "journal-003",
3723			       "j_start (%ld) is too high",
3724			       journal->j_start);
3725	}
3726	return 1;
3727}
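
/*
 * The batching test above boils down to this predicate (a sketch):
 *
 *	may_batch = !journal->j_must_wait &&
 *		    !atomic_read(&journal->j_jlock) &&
 *		    !flush && !commit_now &&
 *		    journal->j_len < journal->j_max_batch &&
 *		    journal->j_len_alloc < journal->j_max_batch &&
 *		    journal->j_cnode_free > 3 * journal->j_trans_max;
 *
 * i.e. leave the transaction open for later writers to join unless
 * somebody needs it on disk now or it is close to its size limits.
 */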
3728
3729/*
3730 * Does all the work that makes deleting blocks safe.
3731 * when deleting a block marked BH_JNew, just remove it from the current
3732 * transaction, clean its buffer_head and move on.
3733 *
3734 * otherwise:
3735 * set a bit for the block in the journal bitmap.  That will prevent it from
3736 * being allocated for unformatted nodes before this transaction has finished.
3737 *
3738 * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
3739 * That will prevent any old transactions with this block from trying to flush
3740 * to the real location.  Since we aren't removing the cnode from the
3741 * journal_list_hash, the block can't be reallocated yet.
3742 *
3743 * Then remove it from the current transaction, decrementing any counters and
3744 * filing it on the clean list.
3745 */
3746int journal_mark_freed(struct reiserfs_transaction_handle *th,
3747		       struct super_block *sb, b_blocknr_t blocknr)
3748{
3749	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3750	struct reiserfs_journal_cnode *cn = NULL;
3751	struct buffer_head *bh = NULL;
3752	struct reiserfs_list_bitmap *jb = NULL;
3753	int cleaned = 0;
3754	BUG_ON(!th->t_trans_id);
3755
3756	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3757	if (cn && cn->bh) {
3758		bh = cn->bh;
3759		get_bh(bh);
3760	}
3761	/* if it is journal new, we just remove it from this transaction */
3762	if (bh && buffer_journal_new(bh)) {
3763		clear_buffer_journal_new(bh);
3764		clear_prepared_bits(bh);
3765		reiserfs_clean_and_file_buffer(bh);
3766		cleaned = remove_from_transaction(sb, blocknr, cleaned);
3767	} else {
3768		/*
3769		 * set the bit for this block in the journal bitmap
3770		 * for this transaction
3771		 */
3772		jb = journal->j_current_jl->j_list_bitmap;
3773		if (!jb) {
3774			reiserfs_panic(sb, "journal-1702",
3775				       "journal_list_bitmap is NULL");
3776		}
3777		set_bit_in_list_bitmap(sb, blocknr, jb);
3778
3779		/* Note, the entire while loop is not allowed to schedule.  */
3780
3781		if (bh) {
3782			clear_prepared_bits(bh);
3783			reiserfs_clean_and_file_buffer(bh);
3784		}
3785		cleaned = remove_from_transaction(sb, blocknr, cleaned);
3786
3787		/*
3788		 * find all older transactions with this block,
3789		 * make sure they don't try to write it out
3790		 */
3791		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
3792					  blocknr);
3793		while (cn) {
3794			if (sb == cn->sb && blocknr == cn->blocknr) {
3795				set_bit(BLOCK_FREED, &cn->state);
3796				if (cn->bh) {
3797					/*
3798					 * remove_from_transaction will brelse
3799					 * the buffer if it was in the current
3800					 * trans
3801					 */
3802					if (!cleaned) {
3803						clear_buffer_journal_dirty(cn->bh);
3805						clear_buffer_dirty(cn->bh);
3806						clear_buffer_journal_test(cn->bh);
3808						cleaned = 1;
3809						put_bh(cn->bh);
3810						if (atomic_read(&cn->bh->b_count) < 0) {
3812							reiserfs_warning(sb, "journal-2138",
3814								 "cn->bh->b_count < 0");
3815						}
3816					}
3817					/*
3818					 * since we are clearing the bh,
3819					 * we MUST dec nonzerolen
3820					 */
3821					if (cn->jlist) {
3822						atomic_dec(&cn->jlist->j_nonzerolen);
3824					}
3825					cn->bh = NULL;
3826				}
3827			}
3828			cn = cn->hnext;
3829		}
3830	}
3831
3832	if (bh)
3833		release_buffer_page(bh); /* get_hash grabs the buffer */
3834	return 0;
3835}
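
/*
 * Sketch of the net effect for the non-BH_JNew path: freeing block B
 * while an older transaction T1 still has it in a journal list does,
 * in order,
 *
 *	set_bit_in_list_bitmap(sb, B, current_jl->j_list_bitmap);
 *		(B can't be handed back out until current_jl is flushed)
 *	remove_from_transaction(sb, B, cleaned);
 *		(B drops out of the running transaction)
 *	set_bit(BLOCK_FREED, &cn->state); cn->bh = NULL; for T1's cnode
 *		(T1's flush skips B instead of writing stale data)
 */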
3836
3837void reiserfs_update_inode_transaction(struct inode *inode)
3838{
3839	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
3840	REISERFS_I(inode)->i_jl = journal->j_current_jl;
3841	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
3842}
3843
3844/*
3845 * returns -1 on error, 0 if no commits/barriers were done and 1
3846 * if a transaction was actually committed and the barrier was done
3847 */
3848static int __commit_trans_jl(struct inode *inode, unsigned long id,
3849			     struct reiserfs_journal_list *jl)
3850{
3851	struct reiserfs_transaction_handle th;
3852	struct super_block *sb = inode->i_sb;
3853	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3854	int ret = 0;
3855
3856	/*
3857	 * is it from the current transaction,
3858	 * or from an unknown transaction?
3859	 */
3860	if (id == journal->j_trans_id) {
3861		jl = journal->j_current_jl;
3862		/*
3863		 * try to let other writers come in and
3864		 * grow this transaction
3865		 */
3866		let_transaction_grow(sb, id);
3867		if (journal->j_trans_id != id) {
3868			goto flush_commit_only;
3869		}
3870
3871		ret = journal_begin(&th, sb, 1);
3872		if (ret)
3873			return ret;
3874
3875		/* someone might have ended this transaction while we joined */
3876		if (journal->j_trans_id != id) {
3877			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3878						     1);
3879			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
3880			ret = journal_end(&th);
3881			goto flush_commit_only;
3882		}
3883
3884		ret = journal_end_sync(&th);
3885		if (!ret)
3886			ret = 1;
3887
3888	} else {
3889		/*
3890		 * this gets tricky, we have to make sure the journal list in
3891		 * the inode still exists.  We know the list is still around
3892		 * if we've got a larger transaction id than the oldest list
3893		 */
3894flush_commit_only:
3895		if (journal_list_still_alive(inode->i_sb, id)) {
3896			/*
3897			 * we only set ret to 1 when we know for sure
3898			 * the barrier hasn't been started yet on the commit
3899			 * block.
3900			 */
3901			if (atomic_read(&jl->j_commit_left) > 1)
3902				ret = 1;
3903			flush_commit_list(sb, jl, 1);
3904			if (journal->j_errno)
3905				ret = journal->j_errno;
3906		}
3907	}
3908	/* otherwise the list is gone, and long since committed */
3909	return ret;
3910}
3911
3912int reiserfs_commit_for_inode(struct inode *inode)
3913{
3914	unsigned int id = REISERFS_I(inode)->i_trans_id;
3915	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
3916
3917	/*
3918	 * for the whole inode, assume an unset id or jl means it was
3919	 * changed in the current transaction; that is more conservative
3920	 */
3921	if (!id || !jl) {
3922		reiserfs_update_inode_transaction(inode);
3923		id = REISERFS_I(inode)->i_trans_id;
3924		/* jl will be updated in __commit_trans_jl */
3925	}
3926
3927	return __commit_trans_jl(inode, id, jl);
3928}
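
/*
 * These two are what fsync-style paths build on.  Roughly (a sketch):
 *
 *	reiserfs_update_inode_transaction(inode);	at modify time
 *	...
 *	ret = reiserfs_commit_for_inode(inode);		at sync time
 *
 * A return of 1 means a commit and barrier were actually issued on
 * the inode's behalf, 0 means the relevant transaction was already
 * safely on disk, and negative values are errors.
 */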
3929
3930void reiserfs_restore_prepared_buffer(struct super_block *sb,
3931				      struct buffer_head *bh)
3932{
3933	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3934	PROC_INFO_INC(sb, journal.restore_prepared);
3935	if (!bh) {
3936		return;
3937	}
3938	if (test_clear_buffer_journal_restore_dirty(bh) &&
3939	    buffer_journal_dirty(bh)) {
3940		struct reiserfs_journal_cnode *cn;
3941		reiserfs_write_lock(sb);
3942		cn = get_journal_hash_dev(sb,
3943					  journal->j_list_hash_table,
3944					  bh->b_blocknr);
3945		if (cn && can_dirty(cn)) {
3946			set_buffer_journal_test(bh);
3947			mark_buffer_dirty(bh);
3948		}
3949		reiserfs_write_unlock(sb);
3950	}
3951	clear_buffer_journal_prepared(bh);
3952}
3953
3954extern struct tree_balance *cur_tb;
3955/*
3956 * before we can change a metadata block, we have to make sure it won't
3957 * be written to disk while we are altering it.  So, we must:
3958 * clean it
3959 * wait on it.
3960 */
3961int reiserfs_prepare_for_journal(struct super_block *sb,
3962				 struct buffer_head *bh, int wait)
3963{
3964	PROC_INFO_INC(sb, journal.prepare);
3965
3966	if (!trylock_buffer(bh)) {
3967		if (!wait)
3968			return 0;
3969		lock_buffer(bh);
3970	}
3971	set_buffer_journal_prepared(bh);
3972	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
3973		clear_buffer_journal_test(bh);
3974		set_buffer_journal_restore_dirty(bh);
3975	}
3976	unlock_buffer(bh);
3977	return 1;
3978}
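
/*
 * Pairing sketch: a prepared buffer either enters the transaction via
 * journal_mark_dirty() or is handed back.  With "th" an open handle
 * and want_to_log() a stand-in for the caller's own test:
 *
 *	if (reiserfs_prepare_for_journal(sb, bh, 0)) {
 *		if (want_to_log(bh))
 *			journal_mark_dirty(&th, bh);
 *		else
 *			reiserfs_restore_prepared_buffer(sb, bh);
 *	}
 *
 * With wait == 0 a contended buffer lock makes this return 0, and the
 * caller must retry or give up instead of sleeping here.
 */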
3979
3980/*
3981 * long and ugly.  If FLUSH_ALL is set, will not return until all commit
3982 * blocks and all real buffers in the trans are on disk.
3983 * If WAIT is set, won't return until all commit blocks are on disk.
3984 *
3985 * keep reading, there are comments as you go along
3986 *
3987 * If the journal is aborted, we just clean up. Things like flushing
3988 * journal lists, etc just won't happen.
3989 */
3990static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
3991{
3992	struct super_block *sb = th->t_super;
3993	struct reiserfs_journal *journal = SB_JOURNAL(sb);
3994	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
3995	struct reiserfs_journal_cnode *last_cn = NULL;
3996	struct reiserfs_journal_desc *desc;
3997	struct reiserfs_journal_commit *commit;
3998	struct buffer_head *c_bh;	/* commit bh */
3999	struct buffer_head *d_bh;	/* desc bh */
4000	int cur_write_start = 0;	/* start index of current log write */
4001	int i;
4002	int flush;
4003	int wait_on_commit;
4004	struct reiserfs_journal_list *jl, *temp_jl;
4005	struct list_head *entry, *safe;
4006	unsigned long jindex;
4007	unsigned int commit_trans_id;
4008	int trans_half;
4009	int depth;
4010
4011	BUG_ON(th->t_refcount > 1);
4012	BUG_ON(!th->t_trans_id);
4013	BUG_ON(!th->t_super);
4014
4015	/*
4016	 * protect flush_older_commits from making mistakes if the
4017	 * transaction ID counter overflows.
4018	 */
4019	if (th->t_trans_id == ~0U)
4020		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
4021	flush = flags & FLUSH_ALL;
4022	wait_on_commit = flags & WAIT;
4023
4024	current->journal_info = th->t_handle_save;
4025	reiserfs_check_lock_depth(sb, "journal end");
4026	if (journal->j_len == 0) {
4027		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
4028					     1);
4029		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
4030	}
4031
4032	lock_journal(sb);
4033	if (journal->j_next_full_flush) {
4034		flags |= FLUSH_ALL;
4035		flush = 1;
4036	}
4037	if (journal->j_next_async_flush) {
4038		flags |= COMMIT_NOW | WAIT;
4039		wait_on_commit = 1;
4040	}
4041
4042	/*
4043	 * check_journal_end locks the journal and unlocks it if it does
4044	 * not return 1.  It tells us whether we should continue with the
4045	 * journal_end, or just return
4046	 */
4047	if (!check_journal_end(th, flags)) {
4048		reiserfs_schedule_old_flush(sb);
4049		wake_queued_writers(sb);
4050		reiserfs_async_progress_wait(sb);
4051		goto out;
4052	}
4053
4054	/* check_journal_end might set these, check again */
4055	if (journal->j_next_full_flush) {
4056		flush = 1;
4057	}
4058
4059	/*
4060	 * j_must_wait means we have to flush the log blocks, and the
4061	 * real blocks for this transaction
4062	 */
4063	if (journal->j_must_wait > 0) {
4064		flush = 1;
4065	}
4066#ifdef REISERFS_PREALLOCATE
4067	/*
4068	 * quota ops might need to nest; set up the journal_info pointer
4069	 * for them and raise the refcount so that it is > 0.
4070	 */
4071	current->journal_info = th;
4072	th->t_refcount++;
4073
4074	/* it should not add new blocks to the transaction */
4075	reiserfs_discard_all_prealloc(th);
4076
4077	th->t_refcount--;
4078	current->journal_info = th->t_handle_save;
4079#endif
4080
4081	/* setup description block */
4082	d_bh =
4083	    journal_getblk(sb,
4084			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4085			   journal->j_start);
4086	set_buffer_uptodate(d_bh);
4087	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
4088	memset(d_bh->b_data, 0, d_bh->b_size);
4089	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
4090	set_desc_trans_id(desc, journal->j_trans_id);
4091
4092	/*
4093	 * set up the commit block.  Don't write this one (keep it clean, too)
4094	 * until after everything else is written
4095	 */
4096	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4097			      ((journal->j_start + journal->j_len +
4098				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
4099	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
4100	memset(c_bh->b_data, 0, c_bh->b_size);
4101	set_commit_trans_id(commit, journal->j_trans_id);
4102	set_buffer_uptodate(c_bh);
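
	/*
	 * Worked example of the ring arithmetic: with an 8192-block log,
	 * j_start == 8000 and j_len == 300, the desc block sits at log
	 * offset 8000 and the commit block at (8000 + 300 + 1) % 8192
	 * == 109, i.e. this transaction wraps around the end of the log.
	 */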
4103
4104	/* init this journal list */
4105	jl = journal->j_current_jl;
4106
4107	/*
4108	 * we lock the commit before doing anything because
4109	 * we want to make sure nobody tries to run flush_commit_list until
4110	 * the new transaction is fully setup, and we've already flushed the
4111	 * ordered bh list
4112	 */
4113	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4114
4115	/* save the transaction id in case we need to commit it later */
4116	commit_trans_id = jl->j_trans_id;
4117
4118	atomic_set(&jl->j_older_commits_done, 0);
4119	jl->j_trans_id = journal->j_trans_id;
4120	jl->j_timestamp = journal->j_trans_start_time;
4121	jl->j_commit_bh = c_bh;
4122	jl->j_start = journal->j_start;
4123	jl->j_len = journal->j_len;
4124	atomic_set(&jl->j_nonzerolen, journal->j_len);
4125	atomic_set(&jl->j_commit_left, journal->j_len + 2);
4126	jl->j_realblock = NULL;
4127
4128	/*
4129	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
4130	 * For each real block, add it to the journal list hash and copy
4131	 * its number into the real block index array in the desc or commit block
4132	 */
4133	trans_half = journal_trans_half(sb->s_blocksize);
4134	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
4135		if (buffer_journaled(cn->bh)) {
4136			jl_cn = get_cnode(sb);
4137			if (!jl_cn) {
4138				reiserfs_panic(sb, "journal-1676",
4139					       "get_cnode returned NULL");
4140			}
4141			if (i == 0) {
4142				jl->j_realblock = jl_cn;
4143			}
4144			jl_cn->prev = last_cn;
4145			jl_cn->next = NULL;
4146			if (last_cn) {
4147				last_cn->next = jl_cn;
4148			}
4149			last_cn = jl_cn;
4150			/*
4151			 * make sure the block we are trying to log
4152			 * is not a block of journal or reserved area
4153			 */
4154			if (is_block_in_log_or_reserved_area(sb, cn->bh->b_blocknr)) {
4156				reiserfs_panic(sb, "journal-2332",
4157					       "Trying to log block %lu, "
4158					       "which is a log block",
4159					       cn->bh->b_blocknr);
4160			}
4161			jl_cn->blocknr = cn->bh->b_blocknr;
4162			jl_cn->state = 0;
4163			jl_cn->sb = sb;
4164			jl_cn->bh = cn->bh;
4165			jl_cn->jlist = jl;
4166			insert_journal_hash(journal->j_list_hash_table, jl_cn);
4167			if (i < trans_half) {
4168				desc->j_realblock[i] =
4169				    cpu_to_le32(cn->bh->b_blocknr);
4170			} else {
4171				commit->j_realblock[i - trans_half] =
4172				    cpu_to_le32(cn->bh->b_blocknr);
4173			}
4174		} else {
4175			i--;
4176		}
4177	}
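
	/*
	 * Index layout sketch: the first trans_half block numbers land in
	 * desc->j_realblock[], the rest in commit->j_realblock[].  On 4k
	 * blocks trans_half works out to 1018, so e.g. the 1020th
	 * journaled buffer (i == 1019) is recorded in commit->j_realblock[1].
	 */
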
4178	set_desc_trans_len(desc, journal->j_len);
4179	set_desc_mount_id(desc, journal->j_mount_id);
4180	set_desc_trans_id(desc, journal->j_trans_id);
4181	set_commit_trans_len(commit, journal->j_len);
4182
4183	/*
4184	 * special check in case all buffers in the journal
4185	 * were marked for not logging
4186	 */
4187	BUG_ON(journal->j_len == 0);
4188
4189	/*
4190	 * we're about to dirty all the log blocks, so mark the description block
4191	 * dirty now too.  Don't mark the commit block dirty until all the
4192	 * others are on disk
4193	 */
4194	mark_buffer_dirty(d_bh);
4195
4196	/*
4197	 * first data block is j_start + 1, so add one to
4198	 * cur_write_start wherever you use it
4199	 */
4200	cur_write_start = journal->j_start;
4201	cn = journal->j_first;
4202	jindex = 1;	/* start at one so we don't get the desc again */
4203	while (cn) {
4204		clear_buffer_journal_new(cn->bh);
4205		/* copy all the real blocks into log area.  dirty log blocks */
4206		if (buffer_journaled(cn->bh)) {
4207			struct buffer_head *tmp_bh;
4208			char *addr;
4209			struct page *page;
4210			tmp_bh =
4211			    journal_getblk(sb,
4212					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4213					   ((cur_write_start +
4214					     jindex) %
4215					    SB_ONDISK_JOURNAL_SIZE(sb)));
4216			set_buffer_uptodate(tmp_bh);
4217			page = cn->bh->b_page;
4218			addr = kmap(page);
4219			memcpy(tmp_bh->b_data,
4220			       addr + offset_in_page(cn->bh->b_data),
4221			       cn->bh->b_size);
4222			kunmap(page);
4223			mark_buffer_dirty(tmp_bh);
4224			jindex++;
4225			set_buffer_journal_dirty(cn->bh);
4226			clear_buffer_journaled(cn->bh);
4227		} else {
4228			/*
4229			 * JDirty cleared sometime during transaction.
4230			 * don't log this one
4231			 */
4232			reiserfs_warning(sb, "journal-2048",
4233					 "BAD, buffer in journal hash, "
4234					 "but not JDirty!");
4235			brelse(cn->bh);
4236		}
4237		next = cn->next;
4238		free_cnode(sb, cn);
4239		cn = next;
4240		reiserfs_cond_resched(sb);
4241	}
4242
4243	/*
4244	 * we are done with both the c_bh and d_bh, but
4245	 * c_bh must be written after all other commit blocks,
4246	 * so we dirty/release c_bh in flush_commit_list, with commit_left <= 1.
4247	 */
4248
4249	journal->j_current_jl = alloc_journal_list(sb);
4250
4251	/* now it is safe to insert this transaction on the main list */
4252	list_add_tail(&jl->j_list, &journal->j_journal_list);
4253	list_add_tail(&jl->j_working_list, &journal->j_working_list);
4254	journal->j_num_work_lists++;
4255
4256	/* reset journal values for the next transaction */
4257	journal->j_start =
4258	    (journal->j_start + journal->j_len +
4259	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
4260	atomic_set(&journal->j_wcount, 0);
4261	journal->j_bcount = 0;
4262	journal->j_last = NULL;
4263	journal->j_first = NULL;
4264	journal->j_len = 0;
4265	journal->j_trans_start_time = 0;
4266	/* check for trans_id overflow */
4267	if (++journal->j_trans_id == 0)
4268		journal->j_trans_id = 10;
4269	journal->j_current_jl->j_trans_id = journal->j_trans_id;
4270	journal->j_must_wait = 0;
4271	journal->j_len_alloc = 0;
4272	journal->j_next_full_flush = 0;
4273	journal->j_next_async_flush = 0;
4274	init_journal_hash(sb);
4275
4276	/*
4277	 * make sure reiserfs_add_jh sees the new current_jl before we
4278	 * write out the tails
4279	 */
4280	smp_mb();
4281
4282	/*
4283	 * tail conversion targets have to hit the disk before we end the
4284	 * transaction.  Otherwise a later transaction might repack the tail
4285	 * before this transaction commits, leaving the data block unflushed
4286	 * and clean; if we crash before the later transaction commits, the
4287	 * data block is lost.
4288	 */
4289	if (!list_empty(&jl->j_tail_bh_list)) {
4290		depth = reiserfs_write_unlock_nested(sb);
4291		write_ordered_buffers(&journal->j_dirty_buffers_lock,
4292				      journal, jl, &jl->j_tail_bh_list);
4293		reiserfs_write_lock_nested(sb, depth);
4294	}
4295	BUG_ON(!list_empty(&jl->j_tail_bh_list));
4296	mutex_unlock(&jl->j_commit_mutex);
4297
4298	/*
4299	 * honor the flush wishes from the caller; simple commits can
4300	 * be done outside the journal lock and are done below
4301	 *
4302	 * if we don't flush the commit list right now, we put it into
4303	 * the work queue so the people waiting on the async progress work
4304	 * queue don't wait for this proc to flush journal lists and such.
4305	 */
4306	if (flush) {
4307		flush_commit_list(sb, jl, 1);
4308		flush_journal_list(sb, jl, 1);
4309	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
4310		/*
4311		 * Avoid queueing work when sb is being shut down. Transaction
4312		 * will be flushed on journal shutdown.
4313		 */
4314		if (sb->s_flags & SB_ACTIVE)
4315			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
4316					   &journal->j_work, HZ / 10);
4317	}
4318
4319	/*
4320	 * if the next transaction has any chance of wrapping, flush
4321	 * transactions that might get overwritten.  If any journal lists
4322	 * are very old, flush them as well.
4323	 */
4324first_jl:
4325	list_for_each_safe(entry, safe, &journal->j_journal_list) {
4326		temp_jl = JOURNAL_LIST_ENTRY(entry);
4327		if (journal->j_start <= temp_jl->j_start) {
4328			if ((journal->j_start + journal->j_trans_max + 1) >=
4329			    temp_jl->j_start) {
4330				flush_used_journal_lists(sb, temp_jl);
4331				goto first_jl;
4332			} else if ((journal->j_start +
4333				    journal->j_trans_max + 1) <
4334				   SB_ONDISK_JOURNAL_SIZE(sb)) {
4335				/*
4336				 * if we don't cross into the next
4337				 * transaction and we don't wrap, there is
4338				 * no way we can overlap any later transactions.
4339				 * Break now
4340				 */
4341				break;
4342			}
4343		} else if ((journal->j_start +
4344			    journal->j_trans_max + 1) >
4345			   SB_ONDISK_JOURNAL_SIZE(sb)) {
4346			if (((journal->j_start + journal->j_trans_max + 1) %
4347			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
4348			    temp_jl->j_start) {
4349				flush_used_journal_lists(sb, temp_jl);
4350				goto first_jl;
4351			} else {
4352				/*
4353				 * we don't overlap anything from our start
4354				 * to the end of the log, and our wrapped
4355				 * portion doesn't overlap anything at
4356				 * the start of the log.  We can break
4357				 */
4358				break;
4359			}
4360		}
4361	}
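
	/*
	 * Example of the overlap test: with an 8192-block log and
	 * j_trans_max == 1024, a new j_start of 8000 means the next
	 * transaction can reach (8000 + 1024 + 1) % 8192 == 833, so any
	 * list whose j_start falls in 8000..8191 or 0..833 had to be
	 * flushed above before it could be overwritten.
	 */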
4362
4363	journal->j_current_jl->j_list_bitmap =
4364	    get_list_bitmap(sb, journal->j_current_jl);
4365
4366	if (!(journal->j_current_jl->j_list_bitmap)) {
4367		reiserfs_panic(sb, "journal-1996",
4368			       "could not get a list bitmap");
4369	}
4370
4371	atomic_set(&journal->j_jlock, 0);
4372	unlock_journal(sb);
4373	/* wake up anybody waiting to join. */
4374	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
4375	wake_up(&journal->j_join_wait);
4376
4377	if (!flush && wait_on_commit &&
4378	    journal_list_still_alive(sb, commit_trans_id)) {
4379		flush_commit_list(sb, jl, 1);
4380	}
4381out:
4382	reiserfs_check_lock_depth(sb, "journal end2");
4383
4384	memset(th, 0, sizeof(*th));
4385	/*
4386	 * Re-set th->t_super, so we can properly keep track of how many
4387	 * persistent transactions there are. We need to do this so that if this
4388	 * call is part of a failed restart_transaction, we can free it later
4389	 */
4390	th->t_super = sb;
4391
4392	return journal->j_errno;
4393}
4394
4395/* Set the file system read-only and refuse new transactions */
4396void reiserfs_abort_journal(struct super_block *sb, int errno)
4397{
4398	struct reiserfs_journal *journal = SB_JOURNAL(sb);
4399	if (test_bit(J_ABORTED, &journal->j_state))
4400		return;
4401
4402	if (!journal->j_errno)
4403		journal->j_errno = errno;
4404
4405	sb->s_flags |= SB_RDONLY;
4406	set_bit(J_ABORTED, &journal->j_state);
4407
4408#ifdef CONFIG_REISERFS_CHECK
4409	dump_stack();
4410#endif
4411}
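
/*
 * Once J_ABORTED is set here, journal_begin() callers are refused new
 * transactions and see the filesystem read-only; per the note above
 * do_journal_end(), in-flight journal state is cleaned up, not flushed.
 */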
4412