xref: /kernel/linux/linux-5.10/fs/jfs/jfs_txnmgr.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *   Copyright (C) International Business Machines Corp., 2000-2005
4 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
5 */
6
7/*
8 *	jfs_txnmgr.c: transaction manager
9 *
10 * notes:
11 * transaction starts with txBegin() and ends with txCommit()
12 * or txAbort().
13 *
14 * tlock is acquired at the time of update;
15 * (obviate scan at commit time for xtree and dtree)
16 * tlock and mp point to each other;
17 * (no hashlist for mp -> tlock).
18 *
19 * special cases:
20 * tlock on in-memory inode:
21 * in-place tlock in the in-memory inode itself;
22 * converted to page lock by iWrite() at commit time.
23 *
24 * tlock during write()/mmap() under anonymous transaction (tid = 0):
25 * transferred (?) to transaction at commit time.
26 *
27 * use the page itself to update allocation maps
28 * (obviate intermediate replication of allocation/deallocation data)
29 * hold on to mp+lock thru update of maps
30 */
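
/*
 * Illustrative call sequence (a sketch only, not taken verbatim from a
 * caller; see e.g. jfs_create() in namei.c for a real instance):
 *
 *	tid = txBegin(sb, 0);
 *	... take tlocks via txLock()/txMaplock() while updating metadata ...
 *	rc = txCommit(tid, nip, iplist, flag);
 *	txEnd(tid);
 */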
31
32#include <linux/fs.h>
33#include <linux/vmalloc.h>
34#include <linux/completion.h>
35#include <linux/freezer.h>
36#include <linux/module.h>
37#include <linux/moduleparam.h>
38#include <linux/kthread.h>
39#include <linux/seq_file.h>
40#include "jfs_incore.h"
41#include "jfs_inode.h"
42#include "jfs_filsys.h"
43#include "jfs_metapage.h"
44#include "jfs_dinode.h"
45#include "jfs_imap.h"
46#include "jfs_dmap.h"
47#include "jfs_superblock.h"
48#include "jfs_debug.h"
49
50/*
51 *	transaction management structures
52 */
53static struct {
54	int freetid;		/* index of a free tid structure */
55	int freelock;		/* index first free lock word */
56	wait_queue_head_t freewait;	/* eventlist of free tblock */
57	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
58	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
59	int tlocksInUse;	/* Number of tlocks in use */
60	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
61/*	struct tblock *sync_queue; * Transactions waiting for data sync */
62	struct list_head unlock_queue;	/* Txns waiting to be released */
63	struct list_head anon_list;	/* inodes having anonymous txns */
64	struct list_head anon_list2;	/* inodes having anonymous txns
65					   that couldn't be sync'ed */
66} TxAnchor;
67
68int jfs_tlocks_low;		/* Indicates low number of available tlocks */
69
70#ifdef CONFIG_JFS_STATISTICS
71static struct {
72	uint txBegin;
73	uint txBegin_barrier;
74	uint txBegin_lockslow;
75	uint txBegin_freetid;
76	uint txBeginAnon;
77	uint txBeginAnon_barrier;
78	uint txBeginAnon_lockslow;
79	uint txLockAlloc;
80	uint txLockAlloc_freelock;
81} TxStat;
82#endif
83
84static int nTxBlock = -1;	/* number of transaction blocks */
85module_param(nTxBlock, int, 0);
86MODULE_PARM_DESC(nTxBlock,
87		 "Number of transaction blocks (max:65536)");
88
89static int nTxLock = -1;	/* number of transaction locks */
90module_param(nTxLock, int, 0);
91MODULE_PARM_DESC(nTxLock,
92		 "Number of transaction locks (max:65536)");
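
/*
 * Both parameters may be overridden when the module is loaded, e.g.
 * (illustrative values only):
 *
 *	modprobe jfs nTxBlock=4096 nTxLock=32768
 *
 * or, for a built-in jfs, via "jfs.nTxLock=32768" on the kernel
 * command line.
 */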
93
94struct tblock *TxBlock;	/* transaction block table */
95static int TxLockLWM;	/* Low water mark for number of txLocks used */
96static int TxLockHWM;	/* High water mark for number of txLocks used */
97static int TxLockVHWM;	/* Very High water mark */
98struct tlock *TxLock;	/* transaction lock table */
99
100/*
101 *	transaction management lock
102 */
103static DEFINE_SPINLOCK(jfsTxnLock);
104
105#define TXN_LOCK()		spin_lock(&jfsTxnLock)
106#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)
107
108#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
109#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
110#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
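
/*
 * Note that LazyLock is taken with spin_lock_irqsave() and guards
 * TxAnchor.unlock_queue (see the TxAnchor definition above), while
 * jfsTxnLock is a plain, non-IRQ-safe spinlock.
 */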
111
112static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
113static int jfs_commit_thread_waking;
114
115/*
116 * Retry logic exists outside these macros to protect from spurious wakeups.
117 */
118static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
119{
120	DECLARE_WAITQUEUE(wait, current);
121
122	add_wait_queue(event, &wait);
123	set_current_state(TASK_UNINTERRUPTIBLE);
124	TXN_UNLOCK();
125	io_schedule();
126	remove_wait_queue(event, &wait);
127}
128
129#define TXN_SLEEP(event)\
130{\
131	TXN_SLEEP_DROP_LOCK(event);\
132	TXN_LOCK();\
133}
134
135#define TXN_WAKEUP(event) wake_up_all(event)
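
/*
 * Typical use (sketch): callers re-test their condition in a loop, since
 * TXN_SLEEP() may return on a spurious wakeup, e.g. in txLockAlloc():
 *
 *	while (!(lid = TxAnchor.freelock))
 *		TXN_SLEEP(&TxAnchor.freelockwait);
 */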
136
137/*
138 *	statistics
139 */
140static struct {
141	tid_t maxtid;		/* 4: biggest tid ever used */
142	lid_t maxlid;		/* 4: biggest lid ever used */
143	int ntid;		/* 4: # of transactions performed */
144	int nlid;		/* 4: # of tlocks acquired */
145	int waitlock;		/* 4: # of tlock wait */
146} stattx;
147
148/*
149 * forward references
150 */
151static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
152		struct tlock * tlck, struct commit * cd);
153static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
154		struct tlock * tlck);
155static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
156		struct tlock * tlck);
157static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
158		struct tlock * tlck);
159static void txAllocPMap(struct inode *ip, struct maplock * maplock,
160		struct tblock * tblk);
161static void txForce(struct tblock * tblk);
162static int txLog(struct jfs_log * log, struct tblock * tblk,
163		struct commit * cd);
164static void txUpdateMap(struct tblock * tblk);
165static void txRelease(struct tblock * tblk);
166static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
167	   struct tlock * tlck);
168static void LogSyncRelease(struct metapage * mp);
169
170/*
171 *		transaction block/lock management
172 *		---------------------------------
173 */
174
175/*
176 * Get a transaction lock from the free list.  If the number in use is
177 * greater than the high water mark, wake up the sync daemon.  This should
178 * free some anonymous transaction locks.  (TXN_LOCK must be held.)
179 */
180static lid_t txLockAlloc(void)
181{
182	lid_t lid;
183
184	INCREMENT(TxStat.txLockAlloc);
185	if (!TxAnchor.freelock) {
186		INCREMENT(TxStat.txLockAlloc_freelock);
187	}
188
189	while (!(lid = TxAnchor.freelock))
190		TXN_SLEEP(&TxAnchor.freelockwait);
191	TxAnchor.freelock = TxLock[lid].next;
192	HIGHWATERMARK(stattx.maxlid, lid);
193	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
194		jfs_info("txLockAlloc tlocks low");
195		jfs_tlocks_low = 1;
196		wake_up_process(jfsSyncThread);
197	}
198
199	return lid;
200}
201
202static void txLockFree(lid_t lid)
203{
204	TxLock[lid].tid = 0;
205	TxLock[lid].next = TxAnchor.freelock;
206	TxAnchor.freelock = lid;
207	TxAnchor.tlocksInUse--;
208	if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
209		jfs_info("txLockFree jfs_tlocks_low no more");
210		jfs_tlocks_low = 0;
211		TXN_WAKEUP(&TxAnchor.lowlockwait);
212	}
213	TXN_WAKEUP(&TxAnchor.freelockwait);
214}
215
216/*
217 * NAME:	txInit()
218 *
219 * FUNCTION:	initialize transaction management structures
220 *
221 * RETURN:
222 *
223 * serialization: single thread at jfs_init()
224 */
225int txInit(void)
226{
227	int k, size;
228	struct sysinfo si;
229
230	/* Set defaults for nTxLock and nTxBlock if unset */
231
232	if (nTxLock == -1) {
233		if (nTxBlock == -1) {
234			/* Base default on memory size */
235			si_meminfo(&si);
236			if (si.totalram > (256 * 1024)) /* 1 GB */
237				nTxLock = 64 * 1024;
238			else
239				nTxLock = si.totalram >> 2;
240		} else if (nTxBlock > (8 * 1024))
241			nTxLock = 64 * 1024;
242		else
243			nTxLock = nTxBlock << 3;
244	}
245	if (nTxBlock == -1)
246		nTxBlock = nTxLock >> 3;
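
	/*
	 * Worked example (assuming 4K pages): with 512 MB of RAM,
	 * si.totalram is 131072 pages, so nTxLock defaults to
	 * 131072 >> 2 = 32768 and nTxBlock to 32768 >> 3 = 4096.
	 */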
247
248	/* Verify tunable parameters */
249	if (nTxBlock < 16)
250		nTxBlock = 16;	/* No one should set it this low */
251	if (nTxBlock > 65536)
252		nTxBlock = 65536;
253	if (nTxLock < 256)
254		nTxLock = 256;	/* No one should set it this low */
255	if (nTxLock > 65536)
256		nTxLock = 65536;
257
258	printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
259	       nTxBlock, nTxLock);
260	/*
261	 * initialize transaction block (tblock) table
262	 *
263	 * transaction id (tid) = tblock index
264	 * tid = 0 is reserved.
265	 */
266	TxLockLWM = (nTxLock * 4) / 10;
267	TxLockHWM = (nTxLock * 7) / 10;
268	TxLockVHWM = (nTxLock * 8) / 10;
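
	/*
	 * e.g. for nTxLock = 65536 these come out to LWM = 26214,
	 * HWM = 45875 and VHWM = 52428 (40%, 70% and 80% of the
	 * tlock table, respectively).
	 */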
269
270	size = sizeof(struct tblock) * nTxBlock;
271	TxBlock = vmalloc(size);
272	if (TxBlock == NULL)
273		return -ENOMEM;
274
275	for (k = 1; k < nTxBlock - 1; k++) {
276		TxBlock[k].next = k + 1;
277		init_waitqueue_head(&TxBlock[k].gcwait);
278		init_waitqueue_head(&TxBlock[k].waitor);
279	}
280	TxBlock[k].next = 0;
281	init_waitqueue_head(&TxBlock[k].gcwait);
282	init_waitqueue_head(&TxBlock[k].waitor);
283
284	TxAnchor.freetid = 1;
285	init_waitqueue_head(&TxAnchor.freewait);
286
287	stattx.maxtid = 1;	/* statistics */
288
289	/*
290	 * initialize transaction lock (tlock) table
291	 *
292	 * transaction lock id = tlock index
293	 * tlock id = 0 is reserved.
294	 */
295	size = sizeof(struct tlock) * nTxLock;
296	TxLock = vmalloc(size);
297	if (TxLock == NULL) {
298		vfree(TxBlock);
299		return -ENOMEM;
300	}
301
302	/* initialize tlock table */
303	for (k = 1; k < nTxLock - 1; k++)
304		TxLock[k].next = k + 1;
305	TxLock[k].next = 0;
306	init_waitqueue_head(&TxAnchor.freelockwait);
307	init_waitqueue_head(&TxAnchor.lowlockwait);
308
309	TxAnchor.freelock = 1;
310	TxAnchor.tlocksInUse = 0;
311	INIT_LIST_HEAD(&TxAnchor.anon_list);
312	INIT_LIST_HEAD(&TxAnchor.anon_list2);
313
314	LAZY_LOCK_INIT();
315	INIT_LIST_HEAD(&TxAnchor.unlock_queue);
316
317	stattx.maxlid = 1;	/* statistics */
318
319	return 0;
320}
321
322/*
323 * NAME:	txExit()
324 *
325 * FUNCTION:	clean up when module is unloaded
326 */
327void txExit(void)
328{
329	vfree(TxLock);
330	TxLock = NULL;
331	vfree(TxBlock);
332	TxBlock = NULL;
333}
334
335/*
336 * NAME:	txBegin()
337 *
338 * FUNCTION:	start a transaction.
339 *
340 * PARAMETER:	sb	- superblock
341 *		flag	- force for nested tx;
342 *
343 * RETURN:	tid	- transaction id
344 *
345 * note: the force flag allows starting a tx for a nested tx
346 * to prevent deadlock on the logsync barrier;
347 */
348tid_t txBegin(struct super_block *sb, int flag)
349{
350	tid_t t;
351	struct tblock *tblk;
352	struct jfs_log *log;
353
354	jfs_info("txBegin: flag = 0x%x", flag);
355	log = JFS_SBI(sb)->log;
356
357	if (!log) {
358		jfs_error(sb, "read-only filesystem\n");
359		return 0;
360	}
361
362	TXN_LOCK();
363
364	INCREMENT(TxStat.txBegin);
365
366      retry:
367	if (!(flag & COMMIT_FORCE)) {
368		/*
369		 * synchronize with logsync barrier
370		 */
371		if (test_bit(log_SYNCBARRIER, &log->flag) ||
372		    test_bit(log_QUIESCE, &log->flag)) {
373			INCREMENT(TxStat.txBegin_barrier);
374			TXN_SLEEP(&log->syncwait);
375			goto retry;
376		}
377	}
378	if (flag == 0) {
379		/*
380		 * Don't begin transaction if we're getting starved for tlocks
381		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
382		 * free tlocks)
383		 */
384		if (TxAnchor.tlocksInUse > TxLockVHWM) {
385			INCREMENT(TxStat.txBegin_lockslow);
386			TXN_SLEEP(&TxAnchor.lowlockwait);
387			goto retry;
388		}
389	}
390
391	/*
392	 * allocate transaction id/block
393	 */
394	if ((t = TxAnchor.freetid) == 0) {
395		jfs_info("txBegin: waiting for free tid");
396		INCREMENT(TxStat.txBegin_freetid);
397		TXN_SLEEP(&TxAnchor.freewait);
398		goto retry;
399	}
400
401	tblk = tid_to_tblock(t);
402
403	if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
404		/* Don't let a non-forced transaction take the last tblk */
405		jfs_info("txBegin: waiting for free tid");
406		INCREMENT(TxStat.txBegin_freetid);
407		TXN_SLEEP(&TxAnchor.freewait);
408		goto retry;
409	}
410
411	TxAnchor.freetid = tblk->next;
412
413	/*
414	 * initialize transaction
415	 */
416
417	/*
418	 * We can't zero the whole thing or we screw up another thread being
419	 * awakened after sleeping on tblk->waitor
420	 *
421	 * memset(tblk, 0, sizeof(struct tblock));
422	 */
423	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
424
425	tblk->sb = sb;
426	++log->logtid;
427	tblk->logtid = log->logtid;
428
429	++log->active;
430
431	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
432	INCREMENT(stattx.ntid);	/* statistics */
433
434	TXN_UNLOCK();
435
436	jfs_info("txBegin: returning tid = %d", t);
437
438	return t;
439}
440
441/*
442 * NAME:	txBeginAnon()
443 *
444 * FUNCTION:	start an anonymous transaction.
445 *		Blocks if logsync or available tlocks are low to prevent
446 *		anonymous tlocks from depleting supply.
447 *
448 * PARAMETER:	sb	- superblock
449 *
450 * RETURN:	none
451 */
452void txBeginAnon(struct super_block *sb)
453{
454	struct jfs_log *log;
455
456	log = JFS_SBI(sb)->log;
457
458	TXN_LOCK();
459	INCREMENT(TxStat.txBeginAnon);
460
461      retry:
462	/*
463	 * synchronize with logsync barrier
464	 */
465	if (test_bit(log_SYNCBARRIER, &log->flag) ||
466	    test_bit(log_QUIESCE, &log->flag)) {
467		INCREMENT(TxStat.txBeginAnon_barrier);
468		TXN_SLEEP(&log->syncwait);
469		goto retry;
470	}
471
472	/*
473	 * Don't begin transaction if we're getting starved for tlocks
474	 */
475	if (TxAnchor.tlocksInUse > TxLockVHWM) {
476		INCREMENT(TxStat.txBeginAnon_lockslow);
477		TXN_SLEEP(&TxAnchor.lowlockwait);
478		goto retry;
479	}
480	TXN_UNLOCK();
481}
482
483/*
484 *	txEnd()
485 *
486 * function: free specified transaction block.
487 *
488 *	logsync barrier processing:
489 *
490 * serialization:
491 */
492void txEnd(tid_t tid)
493{
494	struct tblock *tblk = tid_to_tblock(tid);
495	struct jfs_log *log;
496
497	jfs_info("txEnd: tid = %d", tid);
498	TXN_LOCK();
499
500	/*
501	 * wakeup transactions waiting on the page locked
502	 * by the current transaction
503	 */
504	TXN_WAKEUP(&tblk->waitor);
505
506	log = JFS_SBI(tblk->sb)->log;
507
508	/*
509	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
510	 * otherwise, we would be left with a transaction that may have been
511	 * reused.
512	 *
513	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
514	 * routine.
515	 */
516	if (tblk->flag & tblkGC_LAZY) {
517		jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
518		TXN_UNLOCK();
519
520		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
521		tblk->flag |= tblkGC_UNLOCKED;
522		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
523		return;
524	}
525
526	jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);
527
528	assert(tblk->next == 0);
529
530	/*
531	 * insert tblock back on freelist
532	 */
533	tblk->next = TxAnchor.freetid;
534	TxAnchor.freetid = tid;
535
536	/*
537	 * mark the tblock not active
538	 */
539	if (--log->active == 0) {
540		clear_bit(log_FLUSH, &log->flag);
541
542		/*
543		 * synchronize with logsync barrier
544		 */
545		if (test_bit(log_SYNCBARRIER, &log->flag)) {
546			TXN_UNLOCK();
547
548			/* write dirty metadata & forward log syncpt */
549			jfs_syncpt(log, 1);
550
551			jfs_info("log barrier off: 0x%x", log->lsn);
552
553			/* enable new transactions to start */
554			clear_bit(log_SYNCBARRIER, &log->flag);
555
556			/* wake up all waiters for the logsync barrier */
557			TXN_WAKEUP(&log->syncwait);
558
559			goto wakeup;
560		}
561	}
562
563	TXN_UNLOCK();
564wakeup:
565	/*
566	 * wake up all waiters for a free tblock
567	 */
568	TXN_WAKEUP(&TxAnchor.freewait);
569}
570
571/*
572 *	txLock()
573 *
574 * function: acquire a transaction lock on the specified <mp>
575 *
576 * parameter:
577 *
578 * return:	transaction lock id
579 *
580 * serialization:
581 */
582struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
583		     int type)
584{
585	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
586	int dir_xtree = 0;
587	lid_t lid;
588	tid_t xtid;
589	struct tlock *tlck;
590	struct xtlock *xtlck;
591	struct linelock *linelock;
592	xtpage_t *p;
593	struct tblock *tblk;
594
595	TXN_LOCK();
596
597	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
598	    !(mp->xflag & COMMIT_PAGE)) {
599		/*
600		 * Directory inode is special.  It can have both an xtree tlock
601		 * and a dtree tlock associated with it.
602		 */
603		dir_xtree = 1;
604		lid = jfs_ip->xtlid;
605	} else
606		lid = mp->lid;
607
608	/* is page not locked by a transaction ? */
609	if (lid == 0)
610		goto allocateLock;
611
612	jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);
613
614	/* is page locked by the requester transaction ? */
615	tlck = lid_to_tlock(lid);
616	if ((xtid = tlck->tid) == tid) {
617		TXN_UNLOCK();
618		goto grantLock;
619	}
620
621	/*
622	 * is page locked by anonymous transaction/lock ?
623	 *
624	 * (page update without transaction (i.e., file write) is
625	 * locked under anonymous transaction tid = 0:
626	 * anonymous tlocks maintained on anonymous tlock list of
627	 * the inode of the page and available to all anonymous
628	 * transactions until txCommit() time at which point
629	 * they are transferred to the transaction tlock list of
630	 * the committing transaction of the inode)
631	 */
632	if (xtid == 0) {
633		tlck->tid = tid;
634		TXN_UNLOCK();
635		tblk = tid_to_tblock(tid);
636		/*
637		 * The order of the tlocks in the transaction is important
638		 * (during truncate, child xtree pages must be freed before
639		 * parent's tlocks change the working map).
640		 * Take tlock off anonymous list and add to tail of
641		 * transaction list
642		 *
643		 * Note:  We really need to get rid of the tid & lid and
644		 * use list_head's.  This code is getting UGLY!
645		 */
646		if (jfs_ip->atlhead == lid) {
647			if (jfs_ip->atltail == lid) {
648				/* only anonymous txn.
649				 * Remove from anon_list
650				 */
651				TXN_LOCK();
652				list_del_init(&jfs_ip->anon_inode_list);
653				TXN_UNLOCK();
654			}
655			jfs_ip->atlhead = tlck->next;
656		} else {
657			lid_t last;
658			for (last = jfs_ip->atlhead;
659			     lid_to_tlock(last)->next != lid;
660			     last = lid_to_tlock(last)->next) {
661				assert(last);
662			}
663			lid_to_tlock(last)->next = tlck->next;
664			if (jfs_ip->atltail == lid)
665				jfs_ip->atltail = last;
666		}
667
668		/* insert the tlock at tail of transaction tlock list */
669
670		if (tblk->next)
671			lid_to_tlock(tblk->last)->next = lid;
672		else
673			tblk->next = lid;
674		tlck->next = 0;
675		tblk->last = lid;
676
677		goto grantLock;
678	}
679
680	goto waitLock;
681
682	/*
683	 * allocate a tlock
684	 */
685      allocateLock:
686	lid = txLockAlloc();
687	tlck = lid_to_tlock(lid);
688
689	/*
690	 * initialize tlock
691	 */
692	tlck->tid = tid;
693
694	TXN_UNLOCK();
695
696	/* mark tlock for meta-data page */
697	if (mp->xflag & COMMIT_PAGE) {
698
699		tlck->flag = tlckPAGELOCK;
700
701		/* mark the page dirty and nohomeok */
702		metapage_nohomeok(mp);
703
704		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
705			 mp, mp->nohomeok, tid, tlck);
706
707		/* if anonymous transaction, and buffer is on the group
708		 * commit synclist, mark inode to show this.  This will
709		 * prevent the buffer from being marked nohomeok for too
710		 * long a time.
711		 */
712		if ((tid == 0) && mp->lsn)
713			set_cflag(COMMIT_Synclist, ip);
714	}
715	/* mark tlock for in-memory inode */
716	else
717		tlck->flag = tlckINODELOCK;
718
719	if (S_ISDIR(ip->i_mode))
720		tlck->flag |= tlckDIRECTORY;
721
722	tlck->type = 0;
723
724	/* bind the tlock and the page */
725	tlck->ip = ip;
726	tlck->mp = mp;
727	if (dir_xtree)
728		jfs_ip->xtlid = lid;
729	else
730		mp->lid = lid;
731
732	/*
733	 * enqueue transaction lock to transaction/inode
734	 */
735	/* insert the tlock at tail of transaction tlock list */
736	if (tid) {
737		tblk = tid_to_tblock(tid);
738		if (tblk->next)
739			lid_to_tlock(tblk->last)->next = lid;
740		else
741			tblk->next = lid;
742		tlck->next = 0;
743		tblk->last = lid;
744	}
745	/* anonymous transaction:
746	 * insert the tlock at head of inode anonymous tlock list
747	 */
748	else {
749		tlck->next = jfs_ip->atlhead;
750		jfs_ip->atlhead = lid;
751		if (tlck->next == 0) {
752			/* This inode's first anonymous transaction */
753			jfs_ip->atltail = lid;
754			TXN_LOCK();
755			list_add_tail(&jfs_ip->anon_inode_list,
756				      &TxAnchor.anon_list);
757			TXN_UNLOCK();
758		}
759	}
760
761	/* initialize type dependent area for linelock */
762	linelock = (struct linelock *) & tlck->lock;
763	linelock->next = 0;
764	linelock->flag = tlckLINELOCK;
765	linelock->maxcnt = TLOCKSHORT;
766	linelock->index = 0;
767
768	switch (type & tlckTYPE) {
769	case tlckDTREE:
770		linelock->l2linesize = L2DTSLOTSIZE;
771		break;
772
773	case tlckXTREE:
774		linelock->l2linesize = L2XTSLOTSIZE;
775
776		xtlck = (struct xtlock *) linelock;
777		xtlck->header.offset = 0;
778		xtlck->header.length = 2;
779
780		if (type & tlckNEW) {
781			xtlck->lwm.offset = XTENTRYSTART;
782		} else {
783			if (mp->xflag & COMMIT_PAGE)
784				p = (xtpage_t *) mp->data;
785			else
786				p = &jfs_ip->i_xtroot;
787			xtlck->lwm.offset =
788			    le16_to_cpu(p->header.nextindex);
789		}
790		xtlck->lwm.length = 0;	/* ! */
791		xtlck->twm.offset = 0;
792		xtlck->hwm.offset = 0;
793
794		xtlck->index = 2;
795		break;
796
797	case tlckINODE:
798		linelock->l2linesize = L2INODESLOTSIZE;
799		break;
800
801	case tlckDATA:
802		linelock->l2linesize = L2DATASLOTSIZE;
803		break;
804
805	default:
806		jfs_err("UFO tlock:0x%p", tlck);
807	}
808
809	/*
810	 * update tlock vector
811	 */
812      grantLock:
813	tlck->type |= type;
814
815	return tlck;
816
817	/*
818	 * page is being locked by another transaction:
819	 */
820      waitLock:
821	/* Only locks on ipimap or ipaimap should reach here */
822	/* assert(jfs_ip->fileset == AGGREGATE_I); */
823	if (jfs_ip->fileset != AGGREGATE_I) {
824		printk(KERN_ERR "txLock: trying to lock locked page!");
825		print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
826			       ip, sizeof(*ip), 0);
827		print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
828			       mp, sizeof(*mp), 0);
829		print_hex_dump(KERN_ERR, "Locker's tblock: ",
830			       DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
831			       sizeof(struct tblock), 0);
832		print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
833			       tlck, sizeof(*tlck), 0);
834		BUG();
835	}
836	INCREMENT(stattx.waitlock);	/* statistics */
837	TXN_UNLOCK();
838	release_metapage(mp);
839	TXN_LOCK();
840	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */
841
842	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
843		 tid, xtid, lid);
844
845	/* Recheck everything since dropping TXN_LOCK */
846	if (xtid && (tlck->mp == mp) && (mp->lid == lid))
847		TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
848	else
849		TXN_UNLOCK();
850	jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);
851
852	return NULL;
853}
854
855/*
856 * NAME:	txRelease()
857 *
858 * FUNCTION:	Release buffers associated with transaction locks, but don't
859 *		mark homeok yet.  This allows other transactions to modify
860 *		buffers, but won't let them go to disk until the commit record
861 *		actually gets written.
862 *
863 * PARAMETER:
864 *		tblk	-
865 *
866 * RETURN:	Errors from subroutines.
867 */
868static void txRelease(struct tblock * tblk)
869{
870	struct metapage *mp;
871	lid_t lid;
872	struct tlock *tlck;
873
874	TXN_LOCK();
875
876	for (lid = tblk->next; lid; lid = tlck->next) {
877		tlck = lid_to_tlock(lid);
878		if ((mp = tlck->mp) != NULL &&
879		    (tlck->type & tlckBTROOT) == 0) {
880			assert(mp->xflag & COMMIT_PAGE);
881			mp->lid = 0;
882		}
883	}
884
885	/*
886	 * wakeup transactions waiting on a page locked
887	 * by the current transaction
888	 */
889	TXN_WAKEUP(&tblk->waitor);
890
891	TXN_UNLOCK();
892}
893
894/*
895 * NAME:	txUnlock()
896 *
897 * FUNCTION:	Initiates pageout of pages modified by tid in journalled
898 *		objects and frees their lockwords.
899 */
900static void txUnlock(struct tblock * tblk)
901{
902	struct tlock *tlck;
903	struct linelock *linelock;
904	lid_t lid, next, llid, k;
905	struct metapage *mp;
906	struct jfs_log *log;
907	int difft, diffp;
908	unsigned long flags;
909
910	jfs_info("txUnlock: tblk = 0x%p", tblk);
911	log = JFS_SBI(tblk->sb)->log;
912
913	/*
914	 * mark page under tlock homeok (its log has been written):
915	 */
916	for (lid = tblk->next; lid; lid = next) {
917		tlck = lid_to_tlock(lid);
918		next = tlck->next;
919
920		jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);
921
922		/* unbind page from tlock */
923		if ((mp = tlck->mp) != NULL &&
924		    (tlck->type & tlckBTROOT) == 0) {
925			assert(mp->xflag & COMMIT_PAGE);
926
927			/* hold buffer
928			 */
929			hold_metapage(mp);
930
931			assert(mp->nohomeok > 0);
932			_metapage_homeok(mp);
933
934			/* inherit younger/larger clsn */
935			LOGSYNC_LOCK(log, flags);
936			if (mp->clsn) {
937				logdiff(difft, tblk->clsn, log);
938				logdiff(diffp, mp->clsn, log);
939				if (difft > diffp)
940					mp->clsn = tblk->clsn;
941			} else
942				mp->clsn = tblk->clsn;
943			LOGSYNC_UNLOCK(log, flags);
944
945			assert(!(tlck->flag & tlckFREEPAGE));
946
947			put_metapage(mp);
948		}
949
950		/* insert tlock, and linelock(s) of the tlock if any,
951		 * at head of freelist
952		 */
953		TXN_LOCK();
954
955		llid = ((struct linelock *) & tlck->lock)->next;
956		while (llid) {
957			linelock = (struct linelock *) lid_to_tlock(llid);
958			k = linelock->next;
959			txLockFree(llid);
960			llid = k;
961		}
962		txLockFree(lid);
963
964		TXN_UNLOCK();
965	}
966	tblk->next = tblk->last = 0;
967
968	/*
969	 * remove tblock from logsynclist
970	 * (allocation map pages inherited the lsn of tblk and
971	 * have been inserted in the logsync list at txUpdateMap())
972	 */
973	if (tblk->lsn) {
974		LOGSYNC_LOCK(log, flags);
975		log->count--;
976		list_del(&tblk->synclist);
977		LOGSYNC_UNLOCK(log, flags);
978	}
979}
980
981/*
982 *	txMaplock()
983 *
984 * function: allocate a transaction lock for freed page/entry;
985 *	for freed page, maplock is used as xtlock/dtlock type;
986 */
987struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
988{
989	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
990	lid_t lid;
991	struct tblock *tblk;
992	struct tlock *tlck;
993	struct maplock *maplock;
994
995	TXN_LOCK();
996
997	/*
998	 * allocate a tlock
999	 */
1000	lid = txLockAlloc();
1001	tlck = lid_to_tlock(lid);
1002
1003	/*
1004	 * initialize tlock
1005	 */
1006	tlck->tid = tid;
1007
1008	/* bind the tlock and the object */
1009	tlck->flag = tlckINODELOCK;
1010	if (S_ISDIR(ip->i_mode))
1011		tlck->flag |= tlckDIRECTORY;
1012	tlck->ip = ip;
1013	tlck->mp = NULL;
1014
1015	tlck->type = type;
1016
1017	/*
1018	 * enqueue transaction lock to transaction/inode
1019	 */
1020	/* insert the tlock at tail of transaction tlock list */
1021	if (tid) {
1022		tblk = tid_to_tblock(tid);
1023		if (tblk->next)
1024			lid_to_tlock(tblk->last)->next = lid;
1025		else
1026			tblk->next = lid;
1027		tlck->next = 0;
1028		tblk->last = lid;
1029	}
1030	/* anonymous transaction:
1031	 * insert the tlock at head of inode anonymous tlock list
1032	 */
1033	else {
1034		tlck->next = jfs_ip->atlhead;
1035		jfs_ip->atlhead = lid;
1036		if (tlck->next == 0) {
1037			/* This inode's first anonymous transaction */
1038			jfs_ip->atltail = lid;
1039			list_add_tail(&jfs_ip->anon_inode_list,
1040				      &TxAnchor.anon_list);
1041		}
1042	}
1043
1044	TXN_UNLOCK();
1045
1046	/* initialize type dependent area for maplock */
1047	maplock = (struct maplock *) & tlck->lock;
1048	maplock->next = 0;
1049	maplock->maxcnt = 0;
1050	maplock->index = 0;
1051
1052	return tlck;
1053}
1054
1055/*
1056 *	txLinelock()
1057 *
1058 * function: allocate a transaction lock for log vector list
1059 */
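/*
 * (Sketch of intended use: the linelock embedded in a tlock holds at most
 * TLOCKSHORT entries; when more are needed, this routine takes another
 * TxLock slot, formats it as a TLOCKLONG linelock and chains it after the
 * linelock passed in.)
 */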
1060struct linelock *txLinelock(struct linelock * tlock)
1061{
1062	lid_t lid;
1063	struct tlock *tlck;
1064	struct linelock *linelock;
1065
1066	TXN_LOCK();
1067
1068	/* allocate a TxLock structure */
1069	lid = txLockAlloc();
1070	tlck = lid_to_tlock(lid);
1071
1072	TXN_UNLOCK();
1073
1074	/* initialize linelock */
1075	linelock = (struct linelock *) tlck;
1076	linelock->next = 0;
1077	linelock->flag = tlckLINELOCK;
1078	linelock->maxcnt = TLOCKLONG;
1079	linelock->index = 0;
1080	if (tlck->flag & tlckDIRECTORY)
1081		linelock->flag |= tlckDIRECTORY;
1082
1083	/* append linelock after tlock */
1084	linelock->next = tlock->next;
1085	tlock->next = lid;
1086
1087	return linelock;
1088}
1089
1090/*
1091 *		transaction commit management
1092 *		-----------------------------
1093 */
1094
1095/*
1096 * NAME:	txCommit()
1097 *
1098 * FUNCTION:	commit the changes to the objects specified in
1099 *		clist.  For journalled segments only the
1100 *		changes of the caller are committed, i.e. by tid.
1101 *		for non-journalled segments the data are flushed to
1102 *		disk and then the change to the disk inode and indirect
1103 *		blocks committed (so blocks newly allocated to the
1104 *		segment will be made a part of the segment atomically).
1105 *
1106 *		all of the segments specified in clist must be in
1107 *		one file system. no more than 6 segments are needed
1108 *		to handle all unix svcs.
1109 *
1110 *		if the i_nlink field (i.e. disk inode link count)
1111 *		is zero, and the type of inode is a regular file or
1112 *		directory, or symbolic link , the inode is truncated
1113 *		directory, or symbolic link, the inode is truncated
1114 *		VM resources are unaffected until it is closed (see
1115 *		iput and iclose).
1116 *
1117 * PARAMETER:
1118 *
1119 * RETURN:
1120 *
1121 * serialization:
1122 *		on entry the inode lock on each segment is assumed
1123 *		to be held.
1124 *
1125 * i/o error:
1126 */
1127int txCommit(tid_t tid,		/* transaction identifier */
1128	     int nip,		/* number of inodes to commit */
1129	     struct inode **iplist,	/* list of inode to commit */
1130	     int flag)
1131{
1132	int rc = 0;
1133	struct commit cd;
1134	struct jfs_log *log;
1135	struct tblock *tblk;
1136	struct lrd *lrd;
1137	struct inode *ip;
1138	struct jfs_inode_info *jfs_ip;
1139	int k, n;
1140	ino_t top;
1141	struct super_block *sb;
1142
1143	jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
1144	/* is read-only file system ? */
1145	if (isReadOnly(iplist[0])) {
1146		rc = -EROFS;
1147		goto TheEnd;
1148	}
1149
1150	sb = cd.sb = iplist[0]->i_sb;
1151	cd.tid = tid;
1152
1153	if (tid == 0)
1154		tid = txBegin(sb, 0);
1155	tblk = tid_to_tblock(tid);
1156
1157	/*
1158	 * initialize commit structure
1159	 */
1160	log = JFS_SBI(sb)->log;
1161	cd.log = log;
1162
1163	/* initialize log record descriptor in commit */
1164	lrd = &cd.lrd;
1165	lrd->logtid = cpu_to_le32(tblk->logtid);
1166	lrd->backchain = 0;
1167
1168	tblk->xflag |= flag;
1169
1170	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1171		tblk->xflag |= COMMIT_LAZY;
1172	/*
1173	 *	prepare non-journaled objects for commit
1174	 *
1175	 * flush data pages of a non-journaled file
1176	 * to prevent the file from getting uninitialized disk blocks
1177	 * in case of a crash.
1178	 * (new blocks - )
1179	 */
1180	cd.iplist = iplist;
1181	cd.nip = nip;
1182
1183	/*
1184	 *	acquire transaction lock on (on-disk) inodes
1185	 *
1186	 * update on-disk inode from in-memory inode
1187	 * acquiring transaction locks for AFTER records
1188	 * on the on-disk inode of file object
1189	 *
1190	 * sort the inodes array by inode number in descending order
1191	 * to prevent deadlock when acquiring transaction lock
1192	 * of on-disk inodes on multiple on-disk inode pages by
1193	 * multiple concurrent transactions
1194	 */
1195	for (k = 0; k < cd.nip; k++) {
1196		top = (cd.iplist[k])->i_ino;
1197		for (n = k + 1; n < cd.nip; n++) {
1198			ip = cd.iplist[n];
1199			if (ip->i_ino > top) {
1200				top = ip->i_ino;
1201				cd.iplist[n] = cd.iplist[k];
1202				cd.iplist[k] = ip;
1203			}
1204		}
1205
1206		ip = cd.iplist[k];
1207		jfs_ip = JFS_IP(ip);
1208
1209		/*
1210		 * BUGBUG - This code has temporarily been removed.  The
1211		 * intent is to ensure that any file data is written before
1212		 * the metadata is committed to the journal.  This prevents
1213		 * uninitialized data from appearing in a file after the
1214		 * journal has been replayed.  (The uninitialized data
1215		 * could be sensitive data removed by another user.)
1216		 *
1217		 * The problem now is that we are holding the IWRITELOCK
1218		 * on the inode, and calling filemap_fdatawrite on an
1219		 * unmapped page will cause a deadlock in jfs_get_block.
1220		 *
1221		 * The long term solution is to pare down the use of
1222		 * IWRITELOCK.  We are currently holding it too long.
1223		 * We could also be smarter about which data pages need
1224		 * to be written before the transaction is committed and
1225		 * when we don't need to worry about it at all.
1226		 *
1227		 * if ((!S_ISDIR(ip->i_mode))
1228		 *    && (tblk->flag & COMMIT_DELETE) == 0)
1229		 *	filemap_write_and_wait(ip->i_mapping);
1230		 */
1231
1232		/*
1233		 * Mark inode as not dirty.  It will still be on the dirty
1234		 * inode list, but we'll know not to commit it again unless
1235		 * it gets marked dirty again
1236		 */
1237		clear_cflag(COMMIT_Dirty, ip);
1238
1239		/* inherit anonymous tlock(s) of inode */
1240		if (jfs_ip->atlhead) {
1241			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
1242			tblk->next = jfs_ip->atlhead;
1243			if (!tblk->last)
1244				tblk->last = jfs_ip->atltail;
1245			jfs_ip->atlhead = jfs_ip->atltail = 0;
1246			TXN_LOCK();
1247			list_del_init(&jfs_ip->anon_inode_list);
1248			TXN_UNLOCK();
1249		}
1250
1251		/*
1252		 * acquire transaction lock on on-disk inode page
1253		 * (become first tlock of the tblk's tlock list)
1254		 */
1255		if (((rc = diWrite(tid, ip))))
1256			goto out;
1257	}
1258
1259	/*
1260	 *	write log records from transaction locks
1261	 *
1262	 * txUpdateMap() resets XAD_NEW in XAD.
1263	 */
1264	if ((rc = txLog(log, tblk, &cd)))
1265		goto TheEnd;
1266
1267	/*
1268	 * Ensure that inode isn't reused before
1269	 * lazy commit thread finishes processing
1270	 */
1271	if (tblk->xflag & COMMIT_DELETE) {
1272		ihold(tblk->u.ip);
1273		/*
1274		 * Avoid a rare deadlock
1275		 *
1276		 * If the inode is locked, we may be blocked in
1277		 * jfs_commit_inode.  If so, we don't want the
1278		 * lazy_commit thread doing the last iput() on the inode
1279		 * since that may block on the locked inode.  Instead,
1280		 * commit the transaction synchronously, so the last iput
1281		 * will be done by the calling thread (or later)
1282		 */
1283		/*
1284		 * I believe this code is no longer needed.  Splitting I_LOCK
1285		 * into two bits, I_NEW and I_SYNC should prevent this
1286		 * deadlock as well.  But since I don't have a JFS testload
1287		 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
1288		 * Joern
1289		 */
1290		if (tblk->u.ip->i_state & I_SYNC)
1291			tblk->xflag &= ~COMMIT_LAZY;
1292	}
1293
1294	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
1295	       ((tblk->u.ip->i_nlink == 0) &&
1296		!test_cflag(COMMIT_Nolink, tblk->u.ip)));
1297
1298	/*
1299	 *	write COMMIT log record
1300	 */
1301	lrd->type = cpu_to_le16(LOG_COMMIT);
1302	lrd->length = 0;
1303	lmLog(log, tblk, lrd, NULL);
1304
1305	lmGroupCommit(log, tblk);
1306
1307	/*
1308	 *	- transaction is now committed -
1309	 */
1310
1311	/*
1312	 * force pages in careful update
1313	 * (imap addressing structure update)
1314	 */
1315	if (flag & COMMIT_FORCE)
1316		txForce(tblk);
1317
1318	/*
1319	 *	update allocation map.
1320	 *
1321	 * update inode allocation map and inode:
1322	 * free pager lock on memory object of inode if any.
1323	 * update block allocation map.
1324	 *
1325	 * txUpdateMap() resets XAD_NEW in XAD.
1326	 */
1327	if (tblk->xflag & COMMIT_FORCE)
1328		txUpdateMap(tblk);
1329
1330	/*
1331	 *	free transaction locks and pageout/free pages
1332	 */
1333	txRelease(tblk);
1334
1335	if ((tblk->flag & tblkGC_LAZY) == 0)
1336		txUnlock(tblk);
1337
1338
1339	/*
1340	 *	reset in-memory object state
1341	 */
1342	for (k = 0; k < cd.nip; k++) {
1343		ip = cd.iplist[k];
1344		jfs_ip = JFS_IP(ip);
1345
1346		/*
1347		 * reset in-memory inode state
1348		 */
1349		jfs_ip->bxflag = 0;
1350		jfs_ip->blid = 0;
1351	}
1352
1353      out:
1354	if (rc != 0)
1355		txAbort(tid, 1);
1356
1357      TheEnd:
1358	jfs_info("txCommit: tid = %d, returning %d", tid, rc);
1359	return rc;
1360}
1361
1362/*
1363 * NAME:	txLog()
1364 *
1365 * FUNCTION:	Writes AFTER log records for all lines modified
1366 *		by tid for segments specified by inodes in comdata.
1367 *		Code assumes only WRITELOCKS are recorded in lockwords.
1368 *
1369 * PARAMETERS:
1370 *
1371 * RETURN :
1372 */
1373static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1374{
1375	int rc = 0;
1376	struct inode *ip;
1377	lid_t lid;
1378	struct tlock *tlck;
1379	struct lrd *lrd = &cd->lrd;
1380
1381	/*
1382	 * write log record(s) for each tlock of transaction,
1383	 */
1384	for (lid = tblk->next; lid; lid = tlck->next) {
1385		tlck = lid_to_tlock(lid);
1386
1387		tlck->flag |= tlckLOG;
1388
1389		/* initialize lrd common */
1390		ip = tlck->ip;
1391		lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
1392		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
1393		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);
1394
1395		/* write log record of page from the tlock */
1396		switch (tlck->type & tlckTYPE) {
1397		case tlckXTREE:
1398			xtLog(log, tblk, lrd, tlck);
1399			break;
1400
1401		case tlckDTREE:
1402			dtLog(log, tblk, lrd, tlck);
1403			break;
1404
1405		case tlckINODE:
1406			diLog(log, tblk, lrd, tlck, cd);
1407			break;
1408
1409		case tlckMAP:
1410			mapLog(log, tblk, lrd, tlck);
1411			break;
1412
1413		case tlckDATA:
1414			dataLog(log, tblk, lrd, tlck);
1415			break;
1416
1417		default:
1418			jfs_err("UFO tlock:0x%p", tlck);
1419		}
1420	}
1421
1422	return rc;
1423}
1424
1425/*
1426 *	diLog()
1427 *
1428 * function:	log inode tlock and format maplock to update bmap;
1429 */
1430static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1431		 struct tlock * tlck, struct commit * cd)
1432{
1433	int rc = 0;
1434	struct metapage *mp;
1435	pxd_t *pxd;
1436	struct pxd_lock *pxdlock;
1437
1438	mp = tlck->mp;
1439
1440	/* initialize as REDOPAGE record format */
1441	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
1442	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
1443
1444	pxd = &lrd->log.redopage.pxd;
1445
1446	/*
1447	 *	inode after image
1448	 */
1449	if (tlck->type & tlckENTRY) {
1450		/* log after-image for logredo(): */
1451		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1452		PXDaddress(pxd, mp->index);
1453		PXDlength(pxd,
1454			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1455		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1456
1457		/* mark page as homeward bound */
1458		tlck->flag |= tlckWRITEPAGE;
1459	} else if (tlck->type & tlckFREE) {
1460		/*
1461		 *	free inode extent
1462		 *
1463		 * (pages of the freed inode extent have been invalidated and
1464		 * a maplock for free of the extent has been formatted at
1465		 * txLock() time);
1466		 *
1467		 * the tlock had been acquired on the inode allocation map page
1468		 * (iag) that specifies the freed extent, even though the map
1469		 * page is not itself logged, to prevent pageout of the map
1470		 * page before the log;
1471		 */
1472
1473		/* log LOG_NOREDOINOEXT of the freed inode extent for
1474		 * logredo() to start NoRedoPage filters, and to update
1475		 * imap and bmap for free of the extent;
1476		 */
1477		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
1478		/*
1479		 * For the LOG_NOREDOINOEXT record, we need
1480		 * to pass the IAG number and inode extent
1481		 * index (within that IAG) from which
1482		 * the extent is being released.  These have been
1483		 * passed to us in the iplist[1] and iplist[2].
1484		 */
1485		lrd->log.noredoinoext.iagnum =
1486		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
1487		lrd->log.noredoinoext.inoext_idx =
1488		    cpu_to_le32((u32) (size_t) cd->iplist[2]);
1489
1490		pxdlock = (struct pxd_lock *) & tlck->lock;
1491		*pxd = pxdlock->pxd;
1492		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1493
1494		/* update bmap */
1495		tlck->flag |= tlckUPDATEMAP;
1496
1497		/* mark page as homeward bound */
1498		tlck->flag |= tlckWRITEPAGE;
1499	} else
1500		jfs_err("diLog: UFO type tlck:0x%p", tlck);
1501#ifdef  _JFS_WIP
1502	/*
1503	 *	alloc/free external EA extent
1504	 *
1505	 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1506	 * of the extent has been formatted at txLock() time;
1507	 */
1508	else {
1509		assert(tlck->type & tlckEA);
1510
1511		/* log LOG_UPDATEMAP for logredo() to update bmap for
1512		 * alloc of new (and free of old) external EA extent;
1513		 */
1514		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1515		pxdlock = (struct pxd_lock *) & tlck->lock;
1516		nlock = pxdlock->index;
1517		for (i = 0; i < nlock; i++, pxdlock++) {
1518			if (pxdlock->flag & mlckALLOCPXD)
1519				lrd->log.updatemap.type =
1520				    cpu_to_le16(LOG_ALLOCPXD);
1521			else
1522				lrd->log.updatemap.type =
1523				    cpu_to_le16(LOG_FREEPXD);
1524			lrd->log.updatemap.nxd = cpu_to_le16(1);
1525			lrd->log.updatemap.pxd = pxdlock->pxd;
1526			lrd->backchain =
1527			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1528		}
1529
1530		/* update bmap */
1531		tlck->flag |= tlckUPDATEMAP;
1532	}
1533#endif				/* _JFS_WIP */
1534
1535	return rc;
1536}
1537
1538/*
1539 *	dataLog()
1540 *
1541 * function:	log data tlock
1542 */
1543static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1544	    struct tlock * tlck)
1545{
1546	struct metapage *mp;
1547	pxd_t *pxd;
1548
1549	mp = tlck->mp;
1550
1551	/* initialize as REDOPAGE record format */
1552	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
1553	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
1554
1555	pxd = &lrd->log.redopage.pxd;
1556
1557	/* log after-image for logredo(): */
1558	lrd->type = cpu_to_le16(LOG_REDOPAGE);
1559
1560	if (jfs_dirtable_inline(tlck->ip)) {
1561		/*
1562		 * The table has been truncated; we must have deleted
1563		 * the last entry, so don't bother logging this
1564		 */
1565		mp->lid = 0;
1566		grab_metapage(mp);
1567		metapage_homeok(mp);
1568		discard_metapage(mp);
1569		tlck->mp = NULL;
1570		return 0;
1571	}
1572
1573	PXDaddress(pxd, mp->index);
1574	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
1575
1576	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1577
1578	/* mark page as homeward bound */
1579	tlck->flag |= tlckWRITEPAGE;
1580
1581	return 0;
1582}
1583
1584/*
1585 *	dtLog()
1586 *
1587 * function:	log dtree tlock and format maplock to update bmap;
1588 */
1589static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1590	   struct tlock * tlck)
1591{
1592	struct metapage *mp;
1593	struct pxd_lock *pxdlock;
1594	pxd_t *pxd;
1595
1596	mp = tlck->mp;
1597
1598	/* initialize as REDOPAGE/NOREDOPAGE record format */
1599	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
1600	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
1601
1602	pxd = &lrd->log.redopage.pxd;
1603
1604	if (tlck->type & tlckBTROOT)
1605		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1606
1607	/*
1608	 *	page extension via relocation: entry insertion;
1609	 *	page extension in-place: entry insertion;
1610	 *	new right page from page split, reinitialized in-line
1611	 *	root from root page split: entry insertion;
1612	 */
1613	if (tlck->type & (tlckNEW | tlckEXTEND)) {
1614		/* log after-image of the new page for logredo():
1615		 * mark log (LOG_NEW) for logredo() to initialize
1616		 * freelist and update bmap for alloc of the new page;
1617		 */
1618		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1619		if (tlck->type & tlckEXTEND)
1620			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
1621		else
1622			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
1623		PXDaddress(pxd, mp->index);
1624		PXDlength(pxd,
1625			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1626		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1627
1628		/* format a maplock for txUpdateMap() to update bPMAP for
1629		 * alloc of the new page;
1630		 */
1631		if (tlck->type & tlckBTROOT)
1632			return;
1633		tlck->flag |= tlckUPDATEMAP;
1634		pxdlock = (struct pxd_lock *) & tlck->lock;
1635		pxdlock->flag = mlckALLOCPXD;
1636		pxdlock->pxd = *pxd;
1637
1638		pxdlock->index = 1;
1639
1640		/* mark page as homeward bound */
1641		tlck->flag |= tlckWRITEPAGE;
1642		return;
1643	}
1644
1645	/*
1646	 *	entry insertion/deletion,
1647	 *	sibling page link update (old right page before split);
1648	 */
1649	if (tlck->type & (tlckENTRY | tlckRELINK)) {
1650		/* log after-image for logredo(): */
1651		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1652		PXDaddress(pxd, mp->index);
1653		PXDlength(pxd,
1654			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1655		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1656
1657		/* mark page as homeward bound */
1658		tlck->flag |= tlckWRITEPAGE;
1659		return;
1660	}
1661
1662	/*
1663	 *	page deletion: page has been invalidated
1664	 *	page relocation: source extent
1665	 *
1666	 *	a maplock for free of the page has been formatted
1667	 *	at txLock() time);
1668	 */
1669	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1670		/* log LOG_NOREDOPAGE of the deleted page for logredo()
1671		 * to start NoRedoPage filter and to update bmap for free
1672		 * of the deleted page
1673		 */
1674		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1675		pxdlock = (struct pxd_lock *) & tlck->lock;
1676		*pxd = pxdlock->pxd;
1677		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1678
1679		/* a maplock for txUpdateMap() for free of the page
1680		 * has been formatted at txLock() time;
1681		 */
1682		tlck->flag |= tlckUPDATEMAP;
1683	}
1684	return;
1685}
1686
1687/*
1688 *	xtLog()
1689 *
1690 * function:	log xtree tlock and format maplock to update bmap;
1691 */
1692static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1693	   struct tlock * tlck)
1694{
1695	struct inode *ip;
1696	struct metapage *mp;
1697	xtpage_t *p;
1698	struct xtlock *xtlck;
1699	struct maplock *maplock;
1700	struct xdlistlock *xadlock;
1701	struct pxd_lock *pxdlock;
1702	pxd_t *page_pxd;
1703	int next, lwm, hwm;
1704
1705	ip = tlck->ip;
1706	mp = tlck->mp;
1707
1708	/* initialize as REDOPAGE/NOREDOPAGE record format */
1709	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
1710	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
1711
1712	page_pxd = &lrd->log.redopage.pxd;
1713
1714	if (tlck->type & tlckBTROOT) {
1715		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1716		p = &JFS_IP(ip)->i_xtroot;
1717		if (S_ISDIR(ip->i_mode))
1718			lrd->log.redopage.type |=
1719			    cpu_to_le16(LOG_DIR_XTREE);
1720	} else
1721		p = (xtpage_t *) mp->data;
1722	next = le16_to_cpu(p->header.nextindex);
1723
1724	xtlck = (struct xtlock *) & tlck->lock;
1725
1726	maplock = (struct maplock *) & tlck->lock;
1727	xadlock = (struct xdlistlock *) maplock;
1728
1729	/*
1730	 *	entry insertion/extension;
1731	 *	sibling page link update (old right page before split);
1732	 */
1733	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1734		/* log after-image for logredo():
1735		 * logredo() will update bmap for alloc of new/extended
1736		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1737		 * after-image of XADlist;
1738		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1739		 * applying the after-image to the meta-data page.
1740		 */
1741		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1742		PXDaddress(page_pxd, mp->index);
1743		PXDlength(page_pxd,
1744			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1745		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1746
1747		/* format a maplock for txUpdateMap() to update bPMAP
1748		 * for alloc of new/extended extents of XAD[lwm:next)
1749		 * from the page itself;
1750		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1751		 */
1752		lwm = xtlck->lwm.offset;
1753		if (lwm == 0)
1754			lwm = XTPAGEMAXSLOT;
1755
1756		if (lwm == next)
1757			goto out;
1758		if (lwm > next) {
1759			jfs_err("xtLog: lwm > next");
1760			goto out;
1761		}
1762		tlck->flag |= tlckUPDATEMAP;
1763		xadlock->flag = mlckALLOCXADLIST;
1764		xadlock->count = next - lwm;
1765		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
1766			int i;
1767			pxd_t *pxd;
1768			/*
1769			 * Lazy commit may allow xtree to be modified before
1770			 * txUpdateMap runs.  Copy xad into linelock to
1771			 * preserve correct data.
1772			 *
1773			 * We can fit twice as many pxd's as xads in the lock
1774			 */
1775			xadlock->flag = mlckALLOCPXDLIST;
1776			pxd = xadlock->xdlist = &xtlck->pxdlock;
1777			for (i = 0; i < xadlock->count; i++) {
1778				PXDaddress(pxd, addressXAD(&p->xad[lwm + i]));
1779				PXDlength(pxd, lengthXAD(&p->xad[lwm + i]));
1780				p->xad[lwm + i].flag &=
1781				    ~(XAD_NEW | XAD_EXTENDED);
1782				pxd++;
1783			}
1784		} else {
1785			/*
1786			 * xdlist will point into the inode's xtree, ensure
1787			 * that transaction is not committed lazily.
1788			 */
1789			xadlock->flag = mlckALLOCXADLIST;
1790			xadlock->xdlist = &p->xad[lwm];
1791			tblk->xflag &= ~COMMIT_LAZY;
1792		}
1793		jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d",
1794			 tlck->ip, mp, tlck, lwm, xadlock->count);
1795
1796		maplock->index = 1;
1797
1798	      out:
1799		/* mark page as homeward bound */
1800		tlck->flag |= tlckWRITEPAGE;
1801
1802		return;
1803	}
1804
1805	/*
1806	 *	page deletion: file deletion/truncation (ref. xtTruncate())
1807	 *
1808	 * (page will be invalidated after log is written and bmap
1809	 * is updated from the page);
1810	 */
1811	if (tlck->type & tlckFREE) {
1812		/* LOG_NOREDOPAGE log for NoRedoPage filter:
1813		 * if page free from file delete, NoRedoFile filter from
1814		 * inode image of zero link count will subsume NoRedoPage
1815		 * filters for each page;
1816		 * if page free from file truncation, write NoRedoPage
1817		 * filter;
1818		 *
1819		 * update of block allocation map for the page itself:
1820		 * if page free from deletion and truncation, LOG_UPDATEMAP
1821		 * log for the page itself is generated from processing
1822		 * its parent page xad entries;
1823		 */
1824		/* if page free from file truncation, log LOG_NOREDOPAGE
1825		 * of the deleted page for logredo() to start NoRedoPage
1826		 * filter for the page;
1827		 */
1828		if (tblk->xflag & COMMIT_TRUNCATE) {
1829			/* write NOREDOPAGE for the page */
1830			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1831			PXDaddress(page_pxd, mp->index);
1832			PXDlength(page_pxd,
1833				  mp->logical_size >> tblk->sb->
1834				  s_blocksize_bits);
1835			lrd->backchain =
1836			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1837
1838			if (tlck->type & tlckBTROOT) {
1839				/* Empty xtree must be logged */
1840				lrd->type = cpu_to_le16(LOG_REDOPAGE);
1841				lrd->backchain =
1842				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1843			}
1844		}
1845
1846		/* init LOG_UPDATEMAP of the freed extents
1847		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
1848		 * for logredo() to update bmap;
1849		 */
1850		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1851		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
1852		xtlck = (struct xtlock *) & tlck->lock;
1853		hwm = xtlck->hwm.offset;
1854		lrd->log.updatemap.nxd =
1855		    cpu_to_le16(hwm - XTENTRYSTART + 1);
1856		/* reformat linelock for lmLog() */
1857		xtlck->header.offset = XTENTRYSTART;
1858		xtlck->header.length = hwm - XTENTRYSTART + 1;
1859		xtlck->index = 1;
1860		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1861
1862		/* format a maplock for txUpdateMap() to update bmap
1863		 * to free extents of XAD[XTENTRYSTART:hwm) from the
1864		 * deleted page itself;
1865		 */
1866		tlck->flag |= tlckUPDATEMAP;
1867		xadlock->count = hwm - XTENTRYSTART + 1;
1868		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
1869			int i;
1870			pxd_t *pxd;
1871			/*
1872			 * Lazy commit may allow xtree to be modified before
1873			 * txUpdateMap runs.  Copy xad into linelock to
1874			 * preserve correct data.
1875			 *
1876			 * We can fit twice as many pxd's as xads in the lock
1877			 */
1878			xadlock->flag = mlckFREEPXDLIST;
1879			pxd = xadlock->xdlist = &xtlck->pxdlock;
1880			for (i = 0; i < xadlock->count; i++) {
1881				PXDaddress(pxd,
1882					addressXAD(&p->xad[XTENTRYSTART + i]));
1883				PXDlength(pxd,
1884					lengthXAD(&p->xad[XTENTRYSTART + i]));
1885				pxd++;
1886			}
1887		} else {
1888			/*
1889			 * xdlist will point into the inode's xtree, ensure
1890			 * that transaction is not committed lazily.
1891			 */
1892			xadlock->flag = mlckFREEXADLIST;
1893			xadlock->xdlist = &p->xad[XTENTRYSTART];
1894			tblk->xflag &= ~COMMIT_LAZY;
1895		}
1896		jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
1897			 tlck->ip, mp, xadlock->count);
1898
1899		maplock->index = 1;
1900
1901		/* mark page as invalid */
1902		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
1903		    && !(tlck->type & tlckBTROOT))
1904			tlck->flag |= tlckFREEPAGE;
1905		/*
1906		   else (tblk->xflag & COMMIT_PMAP)
1907		   ? release the page;
1908		 */
1909		return;
1910	}
1911
1912	/*
1913	 *	page/entry truncation: file truncation (ref. xtTruncate())
1914	 *
1915	 *	|----------+------+------+---------------|
1916	 *		   |      |      |
1917	 *		   |      |     hwm - hwm before truncation
1918	 *		   |     next - truncation point
1919	 *		  lwm - lwm before truncation
1920	 * header ?
1921	 */
1922	if (tlck->type & tlckTRUNCATE) {
1923		pxd_t pxd;	/* truncated extent of xad */
1924		int twm;
1925
1926		/*
1927		 * For truncation the entire linelock may be used, so it would
1928		 * be difficult to store xad list in linelock itself.
1929		 * Therefore, we'll just force transaction to be committed
1930		 * synchronously, so that xtree pages won't be changed before
1931		 * txUpdateMap runs.
1932		 */
1933		tblk->xflag &= ~COMMIT_LAZY;
1934		lwm = xtlck->lwm.offset;
1935		if (lwm == 0)
1936			lwm = XTPAGEMAXSLOT;
1937		hwm = xtlck->hwm.offset;
1938		twm = xtlck->twm.offset;
1939
1940		/*
1941		 *	write log records
1942		 */
1943		/* log after-image for logredo():
1944		 *
1945		 * logredo() will update bmap for alloc of new/extended
1946		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1947		 * after-image of XADlist;
1948		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1949		 * applying the after-image to the meta-data page.
1950		 */
1951		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1952		PXDaddress(page_pxd, mp->index);
1953		PXDlength(page_pxd,
1954			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1955		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1956
1957		/*
1958		 * truncate entry XAD[twm == next - 1]:
1959		 */
1960		if (twm == next - 1) {
1961			/* init LOG_UPDATEMAP for logredo() to update bmap for
1962			 * free of truncated delta extent of the truncated
1963			 * entry XAD[next - 1]:
1964			 * (xtlck->pxdlock = truncated delta extent);
1965			 */
1966			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
1967			/* assert(pxdlock->type & tlckTRUNCATE); */
1968			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1969			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
1970			lrd->log.updatemap.nxd = cpu_to_le16(1);
1971			lrd->log.updatemap.pxd = pxdlock->pxd;
1972			pxd = pxdlock->pxd;	/* save to format maplock */
1973			lrd->backchain =
1974			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1975		}
1976
1977		/*
1978		 * free entries XAD[next:hwm]:
1979		 */
1980		if (hwm >= next) {
1981			/* init LOG_UPDATEMAP of the freed extents
1982			 * XAD[next:hwm] from the deleted page itself
1983			 * for logredo() to update bmap;
1984			 */
1985			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1986			lrd->log.updatemap.type =
1987			    cpu_to_le16(LOG_FREEXADLIST);
1988			xtlck = (struct xtlock *) & tlck->lock;
1989			hwm = xtlck->hwm.offset;
1990			lrd->log.updatemap.nxd =
1991			    cpu_to_le16(hwm - next + 1);
1992			/* reformat linelock for lmLog() */
1993			xtlck->header.offset = next;
1994			xtlck->header.length = hwm - next + 1;
1995			xtlck->index = 1;
1996			lrd->backchain =
1997			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1998		}
1999
2000		/*
2001		 *	format maplock(s) for txUpdateMap() to update bmap
2002		 */
2003		maplock->index = 0;
2004
2005		/*
2006		 * allocate entries XAD[lwm:next):
2007		 */
2008		if (lwm < next) {
2009			/* format a maplock for txUpdateMap() to update bPMAP
2010			 * for alloc of new/extended extents of XAD[lwm:next)
2011			 * from the page itself;
2012			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
2013			 */
2014			tlck->flag |= tlckUPDATEMAP;
2015			xadlock->flag = mlckALLOCXADLIST;
2016			xadlock->count = next - lwm;
2017			xadlock->xdlist = &p->xad[lwm];
2018
2019			jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d",
2020				 tlck->ip, mp, xadlock->count, lwm, next);
2021			maplock->index++;
2022			xadlock++;
2023		}
2024
2025		/*
2026		 * truncate entry XAD[twm == next - 1]:
2027		 */
2028		if (twm == next - 1) {
2029			/* format a maplock for txUpdateMap() to update bmap
2030			 * to free truncated delta extent of the truncated
2031			 * entry XAD[next - 1];
2032			 * (xtlck->pxdlock = truncated delta extent);
2033			 */
2034			tlck->flag |= tlckUPDATEMAP;
2035			pxdlock = (struct pxd_lock *) xadlock;
2036			pxdlock->flag = mlckFREEPXD;
2037			pxdlock->count = 1;
2038			pxdlock->pxd = pxd;
2039
2040			jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d",
2041				 ip, mp, pxdlock->count, hwm);
2042			maplock->index++;
2043			xadlock++;
2044		}
2045
2046		/*
2047		 * free entries XAD[next:hwm]:
2048		 */
2049		if (hwm >= next) {
2050			/* format a maplock for txUpdateMap() to update bmap
2051			 * to free extents of XAD[next:hwm] from the deleted
2052			 * page itself;
2053			 */
2054			tlck->flag |= tlckUPDATEMAP;
2055			xadlock->flag = mlckFREEXADLIST;
2056			xadlock->count = hwm - next + 1;
2057			xadlock->xdlist = &p->xad[next];
2058
2059			jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d",
2060				 tlck->ip, mp, xadlock->count, next, hwm);
2061			maplock->index++;
2062		}
2063
2064		/* mark page as homeward bound */
2065		tlck->flag |= tlckWRITEPAGE;
2066	}
2067	return;
2068}
2069
2070/*
2071 *	mapLog()
2072 *
2073 * function:	log from maplock of freed data extents;
2074 */
2075static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2076		   struct tlock * tlck)
2077{
2078	struct pxd_lock *pxdlock;
2079	int i, nlock;
2080	pxd_t *pxd;
2081
2082	/*
2083	 *	page relocation: free the source page extent
2084	 *
2085	 * a maplock for txUpdateMap() for free of the page
2086	 * has been formatted at txLock() time saving the src
2087	 * relocated page address;
2088	 */
2089	if (tlck->type & tlckRELOCATE) {
2090		/* log LOG_NOREDOPAGE of the old relocated page
2091		 * for logredo() to start NoRedoPage filter;
2092		 */
2093		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
2094		pxdlock = (struct pxd_lock *) & tlck->lock;
2095		pxd = &lrd->log.redopage.pxd;
2096		*pxd = pxdlock->pxd;
2097		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2098
2099		/* (N.B. currently, logredo() does NOT update bmap
2100		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
2101		 * if page free from relocation, LOG_UPDATEMAP log is
2102		 * specifically generated now for logredo()
2103		 * to update bmap for free of src relocated page;
2104		 * (new flag LOG_RELOCATE may be introduced which will
2105		 * inform logredo() to start NORedoPage filter and also
2106		 * update block allocation map at the same time, thus
2107		 * avoiding an extra log write);
2108		 */
2109		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2110		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
2111		lrd->log.updatemap.nxd = cpu_to_le16(1);
2112		lrd->log.updatemap.pxd = pxdlock->pxd;
2113		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2114
2115		/* a maplock for txUpdateMap() for free of the page
2116		 * has been formatted at txLock() time;
2117		 */
2118		tlck->flag |= tlckUPDATEMAP;
2119		return;
2120	}
2121	/*
2122	 * Otherwise it's not a relocate request:
2123	 * log LOG_UPDATEMAP records for the extents
2124	 * recorded in the tlock's pxd maplock(s).
2125	 */
2126	else {
2127		/* log LOG_UPDATEMAP for logredo() to update bmap for
2128		 * free of truncated/relocated delta extent of the data;
2129		 * e.g.: external EA extent, relocated/truncated extent
2130		 * from xtTailgate();
2131		 */
2132		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2133		pxdlock = (struct pxd_lock *) & tlck->lock;
2134		nlock = pxdlock->index;
2135		for (i = 0; i < nlock; i++, pxdlock++) {
2136			if (pxdlock->flag & mlckALLOCPXD)
2137				lrd->log.updatemap.type =
2138				    cpu_to_le16(LOG_ALLOCPXD);
2139			else
2140				lrd->log.updatemap.type =
2141				    cpu_to_le16(LOG_FREEPXD);
2142			lrd->log.updatemap.nxd = cpu_to_le16(1);
2143			lrd->log.updatemap.pxd = pxdlock->pxd;
2144			lrd->backchain =
2145			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2146			jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
2147				 (ulong) addressPXD(&pxdlock->pxd),
2148				 lengthPXD(&pxdlock->pxd));
2149		}
2150
2151		/* update bmap */
2152		tlck->flag |= tlckUPDATEMAP;
2153	}
2154}
2155
2156/*
2157 *	txEA()
2158 *
2159 * function:	acquire maplock for EA/ACL extents or
2160 *		set COMMIT_INLINE flag;
2161 */
2162void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2163{
2164	struct tlock *tlck = NULL;
2165	struct pxd_lock *maplock = NULL, *pxdlock = NULL;
2166
2167	/*
2168	 * format maplock for alloc of new EA extent
2169	 */
2170	if (newea) {
2171		/* Since the newea could be a completely zeroed entry we need to
2172		 * check for the two flags which indicate we should actually
2173		 * commit new EA data
2174		 */
2175		if (newea->flag & DXD_EXTENT) {
2176			tlck = txMaplock(tid, ip, tlckMAP);
2177			maplock = (struct pxd_lock *) & tlck->lock;
2178			pxdlock = (struct pxd_lock *) maplock;
2179			pxdlock->flag = mlckALLOCPXD;
2180			PXDaddress(&pxdlock->pxd, addressDXD(newea));
2181			PXDlength(&pxdlock->pxd, lengthDXD(newea));
2182			pxdlock++;
2183			maplock->index = 1;
2184		} else if (newea->flag & DXD_INLINE) {
2185			tlck = NULL;
2186
2187			set_cflag(COMMIT_Inlineea, ip);
2188		}
2189	}
2190
2191	/*
2192	 * format maplock for free of old EA extent
2193	 */
2194	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
2195		if (tlck == NULL) {
2196			tlck = txMaplock(tid, ip, tlckMAP);
2197			maplock = (struct pxd_lock *) & tlck->lock;
2198			pxdlock = (struct pxd_lock *) maplock;
2199			maplock->index = 0;
2200		}
2201		pxdlock->flag = mlckFREEPXD;
2202		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
2203		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
2204		maplock->index++;
2205	}
2206}
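/*
 * usage sketch (hypothetical caller; the variable names below are
 * illustrative, not taken from the real EA code):
 *
 *	dxd_t old_ea = JFS_IP(ip)->ea;	- descriptor of the current EA
 *	dxd_t new_ea;			- built by the EA writer
 *	...
 *	txEA(tid, ip, &old_ea, &new_ea);
 *	JFS_IP(ip)->ea = new_ea;
 *	txCommit(tid, 1, &ip, 0);
 */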
2207
2208/*
2209 *	txForce()
2210 *
2211 * function: synchronously write pages locked by transaction
2212 *	     after txLog() but before txUpdateMap();
2213 */
2214static void txForce(struct tblock * tblk)
2215{
2216	struct tlock *tlck;
2217	lid_t lid, next;
2218	struct metapage *mp;
2219
2220	/*
2221	 * reverse the order of transaction tlocks in
2222	 * careful update order of address index pages
2223	 * (right to left, bottom up)
2224	 */
2225	tlck = lid_to_tlock(tblk->next);
2226	lid = tlck->next;
2227	tlck->next = 0;
2228	while (lid) {
2229		tlck = lid_to_tlock(lid);
2230		next = tlck->next;
2231		tlck->next = tblk->next;
2232		tblk->next = lid;
2233		lid = next;
2234	}
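	/*
	 * (illustration with hypothetical lids: a chain
	 *  tblk->next = 7 -> 3 -> 9 -> 0 leaves the loop above as
	 *  tblk->next = 9 -> 3 -> 7 -> 0, so the pages are forced below
	 *  in the reverse of the order in which they were locked)
	 */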
2235
2236	/*
2237	 * synchronously write the page, and
2238	 * hold the page for txUpdateMap();
2239	 */
2240	for (lid = tblk->next; lid; lid = next) {
2241		tlck = lid_to_tlock(lid);
2242		next = tlck->next;
2243
2244		if ((mp = tlck->mp) != NULL &&
2245		    (tlck->type & tlckBTROOT) == 0) {
2246			assert(mp->xflag & COMMIT_PAGE);
2247
2248			if (tlck->flag & tlckWRITEPAGE) {
2249				tlck->flag &= ~tlckWRITEPAGE;
2250
2251				/* do not release page to freelist */
2252				force_metapage(mp);
2253#if 0
2254				/*
2255				 * The "right" thing to do here is to
2256				 * synchronously write the metadata.
2257				 * With the current implementation this
2258				 * is hard since write_metapage requires
2259				 * us to kunmap & remap the page.  If we
2260				 * have tlocks pointing into the metadata
2261				 * pages, we don't want to do this.  I think
2262				 * we can get by with synchronously writing
2263				 * the pages when they are released.
2264				 */
2265				assert(mp->nohomeok);
2266				set_bit(META_dirty, &mp->flag);
2267				set_bit(META_sync, &mp->flag);
2268#endif
2269			}
2270		}
2271	}
2272}
2273
2274/*
2275 *	txUpdateMap()
2276 *
2277 * function:	update persistent allocation map (and working map
2278 *		if appropriate);
2279 *
2280 * parameter:
2281 */
2282static void txUpdateMap(struct tblock * tblk)
2283{
2284	struct inode *ip;
2285	struct inode *ipimap;
2286	lid_t lid;
2287	struct tlock *tlck;
2288	struct maplock *maplock;
2289	struct pxd_lock pxdlock;
2290	int maptype;
2291	int k, nlock;
2292	struct metapage *mp = NULL;
2293
2294	ipimap = JFS_SBI(tblk->sb)->ipimap;
2295
2296	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
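	/*
	 * (COMMIT_PMAP: free blocks only in the persistent map here; the
	 *  working map is updated later, e.g. when the last reference to
	 *  a deleted regular file is released - see the free path below.
	 *  COMMIT_PWMAP: free blocks in both maps now.)
	 */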
2297
2298
2299	/*
2300	 *	update block allocation map
2301	 *
2302	 * update allocation state in pmap (and wmap) and
2303	 * update lsn of the pmap page;
2304	 */
2305	/*
2306	 * scan each tlock/page of transaction for block allocation/free:
2307	 *
2308	 * for each tlock/page of transaction, update map.
2309	 *  ? are there tlock for pmap and pwmap at the same time ?
2310	 *  ? are there tlocks for pmap and pwmap at the same time ?
2311	for (lid = tblk->next; lid; lid = tlck->next) {
2312		tlck = lid_to_tlock(lid);
2313
2314		if ((tlck->flag & tlckUPDATEMAP) == 0)
2315			continue;
2316
2317		if (tlck->flag & tlckFREEPAGE) {
2318			/*
2319			 * Another thread may attempt to reuse freed space
2320			 * immediately, so we want to get rid of the metapage
2321			 * before anyone else has a chance to get it.
2322			 * Lock metapage, update maps, then invalidate
2323			 * the metapage.
2324			 */
2325			mp = tlck->mp;
2326			ASSERT(mp->xflag & COMMIT_PAGE);
2327			grab_metapage(mp);
2328		}
2329
2330		/*
2331		 * extent list:
2332		 * . in-line PXD list:
2333		 * . out-of-line XAD list:
2334		 */
2335		maplock = (struct maplock *) & tlck->lock;
2336		nlock = maplock->index;
2337
2338		for (k = 0; k < nlock; k++, maplock++) {
2339			/*
2340			 * allocate blocks in persistent map:
2341			 *
2342			 * blocks have been allocated from wmap at alloc time;
2343			 */
2344			if (maplock->flag & mlckALLOC) {
2345				txAllocPMap(ipimap, maplock, tblk);
2346			}
2347			/*
2348			 * free blocks in persistent and working map:
2349			 * blocks will be freed in pmap and then in wmap;
2350			 *
2351			 * ? tblock specifies the PMAP/PWMAP based upon
2352			 * transaction
2353			 *
2354			 * free blocks in persistent map:
2355			 * blocks will be freed from wmap at last reference
2356			 * release of the object for regular files;
2357			 *
2358			 * Always free blocks from both persistent & working
2359			 * maps for directories
2360			 */
2361			else {	/* (maplock->flag & mlckFREE) */
2362
2363				if (tlck->flag & tlckDIRECTORY)
2364					txFreeMap(ipimap, maplock,
2365						  tblk, COMMIT_PWMAP);
2366				else
2367					txFreeMap(ipimap, maplock,
2368						  tblk, maptype);
2369			}
2370		}
2371		if (tlck->flag & tlckFREEPAGE) {
2372			if (!(tblk->flag & tblkGC_LAZY)) {
2373				/* This is equivalent to txRelease */
2374				ASSERT(mp->lid == lid);
2375				tlck->mp->lid = 0;
2376			}
2377			assert(mp->nohomeok == 1);
2378			metapage_homeok(mp);
2379			discard_metapage(mp);
2380			tlck->mp = NULL;
2381		}
2382	}
2383	/*
2384	 *	update inode allocation map
2385	 *
2386	 * update allocation state in pmap and
2387	 * update lsn of the pmap page;
2388	 * update in-memory inode flag/state
2389	 *
2390	 * unlock mapper/write lock
2391	 */
2392	if (tblk->xflag & COMMIT_CREATE) {
2393		diUpdatePMap(ipimap, tblk->ino, false, tblk);
2394		/* update persistent block allocation map
2395		 * for the allocation of inode extent;
2396		 */
2397		pxdlock.flag = mlckALLOCPXD;
2398		pxdlock.pxd = tblk->u.ixpxd;
2399		pxdlock.index = 1;
2400		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
2401	} else if (tblk->xflag & COMMIT_DELETE) {
2402		ip = tblk->u.ip;
2403		diUpdatePMap(ipimap, ip->i_ino, true, tblk);
2404		iput(ip);
2405	}
2406}
2407
2408/*
2409 *	txAllocPMap()
2410 *
2411 * function: allocate from persistent map;
2412 *
2413 * parameter:
2414 *	ipbmap	-
2415 *	maplock	-
2416 *		xad list:
2417 *		pxd:
2418 *
2419 *	maptype -
2420 *		allocate from persistent map;
2421 *		free from persistent map;
2422 *		(e.g., tmp file - free from working map at release
2423 *		 of last reference);
2424 *		free from persistent and working map;
2425 *
2426 *	lsn	- log sequence number;
2427 */
2428static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2429			struct tblock * tblk)
2430{
2431	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2432	struct xdlistlock *xadlistlock;
2433	xad_t *xad;
2434	s64 xaddr;
2435	int xlen;
2436	struct pxd_lock *pxdlock;
2437	struct xdlistlock *pxdlistlock;
2438	pxd_t *pxd;
2439	int n;
2440
2441	/*
2442	 * allocate from persistent map;
2443	 */
2444	if (maplock->flag & mlckALLOCXADLIST) {
2445		xadlistlock = (struct xdlistlock *) maplock;
2446		xad = xadlistlock->xdlist;
2447		for (n = 0; n < xadlistlock->count; n++, xad++) {
2448			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
2449				xaddr = addressXAD(xad);
2450				xlen = lengthXAD(xad);
2451				dbUpdatePMap(ipbmap, false, xaddr,
2452					     (s64) xlen, tblk);
2453				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
2454				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2455					 (ulong) xaddr, xlen);
2456			}
2457		}
2458	} else if (maplock->flag & mlckALLOCPXD) {
2459		pxdlock = (struct pxd_lock *) maplock;
2460		xaddr = addressPXD(&pxdlock->pxd);
2461		xlen = lengthPXD(&pxdlock->pxd);
2462		dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk);
2463		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
2464	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
2465
2466		pxdlistlock = (struct xdlistlock *) maplock;
2467		pxd = pxdlistlock->xdlist;
2468		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2469			xaddr = addressPXD(pxd);
2470			xlen = lengthPXD(pxd);
2471			dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen,
2472				     tblk);
2473			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2474				 (ulong) xaddr, xlen);
2475		}
2476	}
2477}
2478
2479/*
2480 *	txFreeMap()
2481 *
2482 * function:	free from persistent and/or working map;
2483 *
2484 * todo: optimization
2485 */
2486void txFreeMap(struct inode *ip,
2487	       struct maplock * maplock, struct tblock * tblk, int maptype)
2488{
2489	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2490	struct xdlistlock *xadlistlock;
2491	xad_t *xad;
2492	s64 xaddr;
2493	int xlen;
2494	struct pxd_lock *pxdlock;
2495	struct xdlistlock *pxdlistlock;
2496	pxd_t *pxd;
2497	int n;
2498
2499	jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
2500		 tblk, maplock, maptype);
2501
2502	/*
2503	 * free from persistent map;
2504	 */
2505	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
2506		if (maplock->flag & mlckFREEXADLIST) {
2507			xadlistlock = (struct xdlistlock *) maplock;
2508			xad = xadlistlock->xdlist;
2509			for (n = 0; n < xadlistlock->count; n++, xad++) {
2510				if (!(xad->flag & XAD_NEW)) {
2511					xaddr = addressXAD(xad);
2512					xlen = lengthXAD(xad);
2513					dbUpdatePMap(ipbmap, true, xaddr,
2514						     (s64) xlen, tblk);
2515					jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2516						 (ulong) xaddr, xlen);
2517				}
2518			}
2519		} else if (maplock->flag & mlckFREEPXD) {
2520			pxdlock = (struct pxd_lock *) maplock;
2521			xaddr = addressPXD(&pxdlock->pxd);
2522			xlen = lengthPXD(&pxdlock->pxd);
2523			dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen,
2524				     tblk);
2525			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2526				 (ulong) xaddr, xlen);
2527		} else {	/* (maplock->flag & mlckALLOCPXDLIST) */
2528
2529			pxdlistlock = (struct xdlistlock *) maplock;
2530			pxd = pxdlistlock->xdlist;
2531			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2532				xaddr = addressPXD(pxd);
2533				xlen = lengthPXD(pxd);
2534				dbUpdatePMap(ipbmap, true, xaddr,
2535					     (s64) xlen, tblk);
2536				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2537					 (ulong) xaddr, xlen);
2538			}
2539		}
2540	}
2541
2542	/*
2543	 * free from working map;
2544	 */
2545	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
2546		if (maplock->flag & mlckFREEXADLIST) {
2547			xadlistlock = (struct xdlistlock *) maplock;
2548			xad = xadlistlock->xdlist;
2549			for (n = 0; n < xadlistlock->count; n++, xad++) {
2550				xaddr = addressXAD(xad);
2551				xlen = lengthXAD(xad);
2552				dbFree(ip, xaddr, (s64) xlen);
2553				xad->flag = 0;
2554				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2555					 (ulong) xaddr, xlen);
2556			}
2557		} else if (maplock->flag & mlckFREEPXD) {
2558			pxdlock = (struct pxd_lock *) maplock;
2559			xaddr = addressPXD(&pxdlock->pxd);
2560			xlen = lengthPXD(&pxdlock->pxd);
2561			dbFree(ip, xaddr, (s64) xlen);
2562			jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2563				 (ulong) xaddr, xlen);
2564		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2565
2566			pxdlistlock = (struct xdlistlock *) maplock;
2567			pxd = pxdlistlock->xdlist;
2568			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2569				xaddr = addressPXD(pxd);
2570				xlen = lengthPXD(pxd);
2571				dbFree(ip, xaddr, (s64) xlen);
2572				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2573					 (ulong) xaddr, xlen);
2574			}
2575		}
2576	}
2577}
2578
2579/*
2580 *	txFreelock()
2581 *
2582 * function:	remove tlock from inode anonymous locklist
2583 */
2584void txFreelock(struct inode *ip)
2585{
2586	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2587	struct tlock *xtlck, *tlck;
2588	lid_t xlid = 0, lid;
2589
2590	if (!jfs_ip->atlhead)
2591		return;
2592
2593	TXN_LOCK();
2594	xtlck = (struct tlock *) &jfs_ip->atlhead;
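	/*
	 * (the cast above treats the anonymous-list head as a dummy tlock
	 *  so the unlink loop below can update xtlck->next uniformly,
	 *  assuming 'next' is the first field of struct tlock)
	 */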
2595
2596	while ((lid = xtlck->next) != 0) {
2597		tlck = lid_to_tlock(lid);
2598		if (tlck->flag & tlckFREELOCK) {
2599			xtlck->next = tlck->next;
2600			txLockFree(lid);
2601		} else {
2602			xtlck = tlck;
2603			xlid = lid;
2604		}
2605	}
2606
2607	if (jfs_ip->atlhead)
2608		jfs_ip->atltail = xlid;
2609	else {
2610		jfs_ip->atltail = 0;
2611		/*
2612		 * If inode was on anon_list, remove it
2613		 */
2614		list_del_init(&jfs_ip->anon_inode_list);
2615	}
2616	TXN_UNLOCK();
2617}
2618
2619/*
2620 *	txAbort()
2621 *
2622 * function: abort tx before commit;
2623 *
2624 * frees line-locks and segment locks for all
2625 * segments in comdata structure.
2626 * Optionally sets state of file-system to FM_DIRTY in super-block.
2627 * The log age (lsn) of page frames for which the caller holds
2628 * tlocks is reset (to avoid log wrap).
2629 */
2630void txAbort(tid_t tid, int dirty)
2631{
2632	lid_t lid, next;
2633	struct metapage *mp;
2634	struct tblock *tblk = tid_to_tblock(tid);
2635	struct tlock *tlck;
2636
2637	/*
2638	 * free tlocks of the transaction
2639	 */
2640	for (lid = tblk->next; lid; lid = next) {
2641		tlck = lid_to_tlock(lid);
2642		next = tlck->next;
2643		mp = tlck->mp;
2644		JFS_IP(tlck->ip)->xtlid = 0;
2645
2646		if (mp) {
2647			mp->lid = 0;
2648
2649			/*
2650			 * reset lsn of page to avoid logwrap:
2651			 *
2652			 * (page may have been previously committed by another
2653			 * transaction(s) but has not been paged, i.e.,
2654			 * it may be on logsync list even though it has not
2655			 * been logged for the current tx.)
2656			 */
2657			if (mp->xflag & COMMIT_PAGE && mp->lsn)
2658				LogSyncRelease(mp);
2659		}
2660		/* insert tlock at head of freelist */
2661		TXN_LOCK();
2662		txLockFree(lid);
2663		TXN_UNLOCK();
2664	}
2665
2666	/* caller will free the transaction block */
2667
2668	tblk->next = tblk->last = 0;
2669
2670	/*
2671	 * mark filesystem dirty
2672	 */
2673	if (dirty)
2674		jfs_error(tblk->sb, "\n");
2675
2676	return;
2677}
2678
2679/*
2680 *	txLazyCommit(void)
2681 *
2682 *	All transactions except those changing ipimap (COMMIT_FORCE) are
2683 *	processed by this routine.  This ensures that the inode and block
2684 *	allocation maps are updated in order.  For synchronous transactions,
2685 *	let the user thread finish processing after txUpdateMap() is called.
2686 */
2687static void txLazyCommit(struct tblock * tblk)
2688{
2689	struct jfs_log *log;
2690
2691	while (((tblk->flag & tblkGC_READY) == 0) &&
2692	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
2693		/* We must have gotten ahead of the user thread
2694		 */
2695		jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
2696		yield();
2697	}
2698
2699	jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
2700
2701	txUpdateMap(tblk);
2702
2703	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
2704
2705	spin_lock_irq(&log->gclock);	// LOGGC_LOCK
2706
2707	tblk->flag |= tblkGC_COMMITTED;
2708
2709	if (tblk->flag & tblkGC_READY)
2710		log->gcrtc--;
2711
2712	wake_up_all(&tblk->gcwait);	// LOGGC_WAKEUP
2713
2714	/*
2715	 * Can't release log->gclock until we've tested tblk->flag
2716	 */
2717	if (tblk->flag & tblkGC_LAZY) {
2718		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2719		txUnlock(tblk);
2720		tblk->flag &= ~tblkGC_LAZY;
2721		txEnd(tblk - TxBlock);	/* Convert back to tid */
2722	} else
2723		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2724
2725	jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
2726}
2727
2728/*
2729 *	jfs_lazycommit(void)
2730 *
2731 *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
2732 *	context, or where blocking is not wanted, this routine will process
2733 *	committed transactions from the unlock queue.
2734 */
2735int jfs_lazycommit(void *arg)
2736{
2737	int WorkDone;
2738	struct tblock *tblk;
2739	unsigned long flags;
2740	struct jfs_sb_info *sbi;
2741
2742	do {
2743		LAZY_LOCK(flags);
2744		jfs_commit_thread_waking = 0;	/* OK to wake another thread */
2745		while (!list_empty(&TxAnchor.unlock_queue)) {
2746			WorkDone = 0;
2747			list_for_each_entry(tblk, &TxAnchor.unlock_queue,
2748					    cqueue) {
2749
2750				sbi = JFS_SBI(tblk->sb);
2751				/*
2752				 * For each volume, the transactions must be
2753				 * handled in order.  If another commit thread
2754				 * is handling a tblk for this superblock,
2755				 * skip it
2756				 */
2757				if (sbi->commit_state & IN_LAZYCOMMIT)
2758					continue;
2759
2760				sbi->commit_state |= IN_LAZYCOMMIT;
2761				WorkDone = 1;
2762
2763				/*
2764				 * Remove transaction from queue
2765				 */
2766				list_del(&tblk->cqueue);
2767
2768				LAZY_UNLOCK(flags);
2769				txLazyCommit(tblk);
2770				LAZY_LOCK(flags);
2771
2772				sbi->commit_state &= ~IN_LAZYCOMMIT;
2773				/*
2774				 * Don't continue in the for loop.  (We can't
2775				 * anyway, it's unsafe!)  We want to go back to
2776				 * the beginning of the list.
2777				 */
2778				break;
2779			}
2780
2781			/* If there was nothing to do, don't continue */
2782			if (!WorkDone)
2783				break;
2784		}
2785		/* In case a wakeup came while all threads were active */
2786		jfs_commit_thread_waking = 0;
2787
2788		if (freezing(current)) {
2789			LAZY_UNLOCK(flags);
2790			try_to_freeze();
2791		} else {
2792			DECLARE_WAITQUEUE(wq, current);
2793
2794			add_wait_queue(&jfs_commit_thread_wait, &wq);
2795			set_current_state(TASK_INTERRUPTIBLE);
2796			LAZY_UNLOCK(flags);
2797			schedule();
2798			remove_wait_queue(&jfs_commit_thread_wait, &wq);
2799		}
2800	} while (!kthread_should_stop());
2801
2802	if (!list_empty(&TxAnchor.unlock_queue))
2803		jfs_err("jfs_lazycommit being killed w/pending transactions!");
2804	else
2805		jfs_info("jfs_lazycommit being killed");
2806	return 0;
2807}
2808
2809void txLazyUnlock(struct tblock * tblk)
2810{
2811	unsigned long flags;
2812
2813	LAZY_LOCK(flags);
2814
2815	list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
2816	/*
2817	 * Don't wake up a commit thread if there is already one servicing
2818	 * this superblock, or if the last one we woke up hasn't started yet.
2819	 */
2820	if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
2821	    !jfs_commit_thread_waking) {
2822		jfs_commit_thread_waking = 1;
2823		wake_up(&jfs_commit_thread_wait);
2824	}
2825	LAZY_UNLOCK(flags);
2826}
2827
2828static void LogSyncRelease(struct metapage * mp)
2829{
2830	struct jfs_log *log = mp->log;
2831
2832	assert(mp->nohomeok);
2833	assert(log);
2834	metapage_homeok(mp);
2835}
2836
2837/*
2838 *	txQuiesce
2839 *
2840 *	Block all new transactions and push anonymous transactions to
2841 *	completion
2842 *
2843 *	This does almost the same thing as jfs_sync below.  We don't
2844 *	worry about deadlocking when jfs_tlocks_low is set, since we would
2845 *	expect jfs_sync to get us out of that jam.
2846 */
2847void txQuiesce(struct super_block *sb)
2848{
2849	struct inode *ip;
2850	struct jfs_inode_info *jfs_ip;
2851	struct jfs_log *log = JFS_SBI(sb)->log;
2852	tid_t tid;
2853
2854	set_bit(log_QUIESCE, &log->flag);
2855
2856	TXN_LOCK();
2857restart:
2858	while (!list_empty(&TxAnchor.anon_list)) {
2859		jfs_ip = list_entry(TxAnchor.anon_list.next,
2860				    struct jfs_inode_info,
2861				    anon_inode_list);
2862		ip = &jfs_ip->vfs_inode;
2863
2864		/*
2865		 * inode will be removed from anonymous list
2866		 * when it is committed
2867		 */
2868		TXN_UNLOCK();
2869		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
2870		mutex_lock(&jfs_ip->commit_mutex);
2871		txCommit(tid, 1, &ip, 0);
2872		txEnd(tid);
2873		mutex_unlock(&jfs_ip->commit_mutex);
2874		/*
2875		 * Just to be safe.  I don't know how
2876		 * long we can run without blocking
2877		 */
2878		cond_resched();
2879		TXN_LOCK();
2880	}
2881
2882	/*
2883	 * If jfs_sync is running in parallel, there could be some inodes
2884	 * on anon_list2.  Let's check.
2885	 */
2886	if (!list_empty(&TxAnchor.anon_list2)) {
2887		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2888		goto restart;
2889	}
2890	TXN_UNLOCK();
2891
2892	/*
2893	 * We may need to kick off the group commit
2894	 */
2895	jfs_flush_journal(log, 0);
2896}
2897
2898/*
2899 * txResume()
2900 *
2901 * Allows transactions to start again following txQuiesce
2902 */
2903void txResume(struct super_block *sb)
2904{
2905	struct jfs_log *log = JFS_SBI(sb)->log;
2906
2907	clear_bit(log_QUIESCE, &log->flag);
2908	TXN_WAKEUP(&log->syncwait);
2909}
2910
2911/*
2912 *	jfs_sync(void)
2913 *
2914 *	To be run as a kernel daemon.  This is awakened when tlocks run low.
2915 *	We write any inodes that have anonymous tlocks so they will become
2916 *	available.
2917 */
2918int jfs_sync(void *arg)
2919{
2920	struct inode *ip;
2921	struct jfs_inode_info *jfs_ip;
2922	tid_t tid;
2923
2924	do {
2925		/*
2926		 * write each inode on the anonymous inode list
2927		 */
2928		TXN_LOCK();
2929		while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
2930			jfs_ip = list_entry(TxAnchor.anon_list.next,
2931					    struct jfs_inode_info,
2932					    anon_inode_list);
2933			ip = &jfs_ip->vfs_inode;
2934
2935			if (! igrab(ip)) {
2936				/*
2937				 * Inode is being freed
2938				 */
2939				list_del_init(&jfs_ip->anon_inode_list);
2940			} else if (mutex_trylock(&jfs_ip->commit_mutex)) {
2941				/*
2942				 * inode will be removed from anonymous list
2943				 * when it is committed
2944				 */
2945				TXN_UNLOCK();
2946				tid = txBegin(ip->i_sb, COMMIT_INODE);
2947				txCommit(tid, 1, &ip, 0);
2948				txEnd(tid);
2949				mutex_unlock(&jfs_ip->commit_mutex);
2950
2951				iput(ip);
2952				/*
2953				 * Just to be safe.  I don't know how
2954				 * long we can run without blocking
2955				 */
2956				cond_resched();
2957				TXN_LOCK();
2958			} else {
2959				/* We can't get the commit mutex.  It may
2960				 * be held by a thread waiting for tlock's
2961				 * so let's not block here.  Save it to
2962				 * put back on the anon_list.
2963				 */
2964
2965				/* Move from anon_list to anon_list2 */
2966				list_move(&jfs_ip->anon_inode_list,
2967					  &TxAnchor.anon_list2);
2968
2969				TXN_UNLOCK();
2970				iput(ip);
2971				TXN_LOCK();
2972			}
2973		}
2974		/* Add anon_list2 back to anon_list */
2975		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2976
2977		if (freezing(current)) {
2978			TXN_UNLOCK();
2979			try_to_freeze();
2980		} else {
2981			set_current_state(TASK_INTERRUPTIBLE);
2982			TXN_UNLOCK();
2983			schedule();
2984		}
2985	} while (!kthread_should_stop());
2986
2987	jfs_info("jfs_sync being killed");
2988	return 0;
2989}
2990
2991#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
2992int jfs_txanchor_proc_show(struct seq_file *m, void *v)
2993{
2994	char *freewait;
2995	char *freelockwait;
2996	char *lowlockwait;
2997
2998	freewait =
2999	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
3000	freelockwait =
3001	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
3002	lowlockwait =
3003	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3004
3005	seq_printf(m,
3006		       "JFS TxAnchor\n"
3007		       "============\n"
3008		       "freetid = %d\n"
3009		       "freewait = %s\n"
3010		       "freelock = %d\n"
3011		       "freelockwait = %s\n"
3012		       "lowlockwait = %s\n"
3013		       "tlocksInUse = %d\n"
3014		       "jfs_tlocks_low = %d\n"
3015		       "unlock_queue is %sempty\n",
3016		       TxAnchor.freetid,
3017		       freewait,
3018		       TxAnchor.freelock,
3019		       freelockwait,
3020		       lowlockwait,
3021		       TxAnchor.tlocksInUse,
3022		       jfs_tlocks_low,
3023		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
3024	return 0;
3025}
3026#endif
3027
3028#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
3029int jfs_txstats_proc_show(struct seq_file *m, void *v)
3030{
3031	seq_printf(m,
3032		       "JFS TxStats\n"
3033		       "===========\n"
3034		       "calls to txBegin = %d\n"
3035		       "txBegin blocked by sync barrier = %d\n"
3036		       "txBegin blocked by tlocks low = %d\n"
3037		       "txBegin blocked by no free tid = %d\n"
3038		       "calls to txBeginAnon = %d\n"
3039		       "txBeginAnon blocked by sync barrier = %d\n"
3040		       "txBeginAnon blocked by tlocks low = %d\n"
3041		       "calls to txLockAlloc = %d\n"
3042		       "tLockAlloc blocked by no free lock = %d\n",
3043		       TxStat.txBegin,
3044		       TxStat.txBegin_barrier,
3045		       TxStat.txBegin_lockslow,
3046		       TxStat.txBegin_freetid,
3047		       TxStat.txBeginAnon,
3048		       TxStat.txBeginAnon_barrier,
3049		       TxStat.txBeginAnon_lockslow,
3050		       TxStat.txLockAlloc,
3051		       TxStat.txLockAlloc_freelock);
3052	return 0;
3053}
3054#endif
3055