/* Source: fs/ext4/mmp.c (Linux 5.10, revision 8c2ecf20) */
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/fs.h>
3#include <linux/random.h>
4#include <linux/buffer_head.h>
5#include <linux/utsname.h>
6#include <linux/kthread.h>
7
8#include "ext4.h"
9
10/* Checksumming functions */
11static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
12{
13	struct ext4_sb_info *sbi = EXT4_SB(sb);
14	int offset = offsetof(struct mmp_struct, mmp_checksum);
15	__u32 csum;
16
17	csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
18
19	return cpu_to_le32(csum);
20}
21
22static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
23{
24	if (!ext4_has_metadata_csum(sb))
25		return 1;
26
27	return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
28}
29
30static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
31{
32	if (!ext4_has_metadata_csum(sb))
33		return;
34
35	mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
36}
37
/*
 * Write the MMP block using REQ_SYNC to try to get the block on-disk
 * faster.
 *
 * Caller must hold freeze protection (see write_mmp_block()) or be in a
 * context where the filesystem cannot be frozen, since this dirties and
 * writes a buffer. Returns 0 on success, -EIO on write failure.
 */
static int write_mmp_block_thawed(struct super_block *sb,
				  struct buffer_head *bh)
{
	struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);

	/* Refresh the checksum before every write. */
	ext4_mmp_csum_set(sb, mmp);
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	/* Extra reference for the I/O; end_buffer_write_sync() drops it. */
	get_bh(bh);
	submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
	wait_on_buffer(bh);
	if (unlikely(!buffer_uptodate(bh)))
		return -EIO;
	return 0;
}
57
/*
 * Write the MMP block, taking freeze protection first so that we never
 * create dirty buffers on a frozen filesystem.
 */
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
	int ret;

	sb_start_write(sb);
	ret = write_mmp_block_thawed(sb, bh);
	sb_end_write(sb);

	return ret;
}
71
/*
 * Read the MMP block. It _must_ be read from disk and hence we clear the
 * uptodate flag on the buffer.
 *
 * On success *bh holds the verified MMP block. On any failure *bh is
 * released, set to NULL, a warning is logged, and a negative errno is
 * returned (-ENOMEM, read error, -EFSCORRUPTED on bad magic, or
 * -EFSBADCRC on checksum mismatch).
 */
static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
			  ext4_fsblk_t mmp_block)
{
	struct mmp_struct *mmp;
	int ret;

	/* Force a fresh read even when the caller passes a cached buffer. */
	if (*bh)
		clear_buffer_uptodate(*bh);

	/* This would be sb_bread(sb, mmp_block), except we need to be sure
	 * that the MD RAID device cache has been bypassed, and that the read
	 * is not blocked in the elevator. */
	if (!*bh) {
		*bh = sb_getblk(sb, mmp_block);
		if (!*bh) {
			ret = -ENOMEM;
			goto warn_exit;
		}
	}

	lock_buffer(*bh);
	ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
	if (ret)
		goto warn_exit;

	mmp = (struct mmp_struct *)((*bh)->b_data);
	/* Validate magic and checksum before trusting the contents. */
	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
		ret = -EFSCORRUPTED;
		goto warn_exit;
	}
	if (!ext4_mmp_csum_verify(sb, mmp)) {
		ret = -EFSBADCRC;
		goto warn_exit;
	}
	return 0;
warn_exit:
	brelse(*bh);
	*bh = NULL;
	ext4_warning(sb, "Error %d while reading MMP block %llu",
		     ret, mmp_block);
	return ret;
}
118
/*
 * Dump as much information as possible to help the admin.
 *
 * Logs @msg followed by the MMP block's last-update time, node name and
 * block device name. @function/@line identify the call site (normally
 * supplied via the dump_mmp_msg() wrapper macro).
 */
void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
		    const char *function, unsigned int line, const char *msg)
{
	__ext4_warning(sb, function, line, "%s", msg);
	__ext4_warning(sb, function, line,
		       "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
		       (unsigned long long)le64_to_cpu(mmp->mmp_time),
		       (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
		       (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
}
132
/*
 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
 *
 * The thread keeps bumping mmp_seq in the on-disk MMP block as a
 * heartbeat. If another node writes the block behind our back, the fs is
 * multiply mounted and we abort. On clean exit it writes
 * EXT4_MMP_SEQ_CLEAN so the next mount can skip the startup delay.
 */
static int kmmpd(void *data)
{
	struct super_block *sb = (struct super_block *) data;
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
	struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
	struct mmp_struct *mmp;
	ext4_fsblk_t mmp_block;
	u32 seq = 0;
	unsigned long failed_writes = 0;
	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
	unsigned mmp_check_interval;
	unsigned long last_update_time;
	unsigned long diff;
	int retval = 0;

	mmp_block = le64_to_cpu(es->s_mmp_block);
	mmp = (struct mmp_struct *)(bh->b_data);
	mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
	/*
	 * Start with the higher mmp_check_interval and reduce it if
	 * the MMP block is being updated on time.
	 */
	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
				 EXT4_MMP_MIN_CHECK_INTERVAL);
	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
	BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
	bdevname(bh->b_bdev, mmp->mmp_bdevname);

	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
	       sizeof(mmp->mmp_nodename));

	while (!kthread_should_stop() && !sb_rdonly(sb)) {
		if (!ext4_has_feature_mmp(sb)) {
			ext4_warning(sb, "kmmpd being stopped since MMP feature"
				     " has been disabled.");
			goto wait_to_exit;
		}
		/* Sequence wraps within [1, EXT4_MMP_SEQ_MAX]. */
		if (++seq > EXT4_MMP_SEQ_MAX)
			seq = 1;

		mmp->mmp_seq = cpu_to_le32(seq);
		mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
		last_update_time = jiffies;

		retval = write_mmp_block(sb, bh);
		/*
		 * Don't spew too many error messages. Print one every
		 * (s_mmp_update_interval * 60) seconds.
		 */
		if (retval) {
			if ((failed_writes % 60) == 0) {
				ext4_error_err(sb, -retval,
					       "Error writing to MMP block");
			}
			failed_writes++;
		}

		/* Sleep out the remainder of the update interval. */
		diff = jiffies - last_update_time;
		if (diff < mmp_update_interval * HZ)
			schedule_timeout_interruptible(mmp_update_interval *
						       HZ - diff);

		/*
		 * We need to make sure that more than mmp_check_interval
		 * seconds have not passed since writing. If that has happened
		 * we need to check if the MMP block is as we left it.
		 */
		diff = jiffies - last_update_time;
		if (diff > mmp_check_interval * HZ) {
			struct buffer_head *bh_check = NULL;
			struct mmp_struct *mmp_check;

			retval = read_mmp_block(sb, &bh_check, mmp_block);
			if (retval) {
				ext4_error_err(sb, -retval,
					       "error reading MMP data: %d",
					       retval);
				goto wait_to_exit;
			}

			/* Another node touched the block: multiple mount. */
			mmp_check = (struct mmp_struct *)(bh_check->b_data);
			if (mmp->mmp_seq != mmp_check->mmp_seq ||
			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
				   sizeof(mmp->mmp_nodename))) {
				dump_mmp_msg(sb, mmp_check,
					     "Error while updating MMP info. "
					     "The filesystem seems to have been"
					     " multiply mounted.");
				ext4_error_err(sb, EBUSY, "abort");
				put_bh(bh_check);
				retval = -EBUSY;
				goto wait_to_exit;
			}
			put_bh(bh_check);
		}

		/*
		 * Adjust the mmp_check_interval depending on how much time
		 * it took for the MMP block to be written.
		 */
		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
					     EXT4_MMP_MAX_CHECK_INTERVAL),
					 EXT4_MMP_MIN_CHECK_INTERVAL);
		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
	}

	/*
	 * Unmount seems to be clean.
	 */
	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
	mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());

	retval = write_mmp_block(sb, bh);

wait_to_exit:
	/*
	 * Park here until kthread_stop() is called, so ext4_stop_mmpd()
	 * can safely release s_mmp_bh after the thread is gone.
	 */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop())
			schedule();
	}
	set_current_state(TASK_RUNNING);
	return retval;
}
259
260void ext4_stop_mmpd(struct ext4_sb_info *sbi)
261{
262	if (sbi->s_mmp_tsk) {
263		kthread_stop(sbi->s_mmp_tsk);
264		brelse(sbi->s_mmp_bh);
265		sbi->s_mmp_tsk = NULL;
266	}
267}
268
269/*
270 * Get a random new sequence number but make sure it is not greater than
271 * EXT4_MMP_SEQ_MAX.
272 */
273static unsigned int mmp_new_seq(void)
274{
275	u32 new_seq;
276
277	do {
278		new_seq = prandom_u32();
279	} while (new_seq > EXT4_MMP_SEQ_MAX);
280
281	return new_seq;
282}
283
/*
 * Protect the filesystem from being mounted more than once.
 *
 * Mount-time side of the MMP protocol: read the MMP block, and unless it
 * was left clean, wait for (at least) one check interval and verify that
 * no other node updated it. Then write a fresh random sequence, wait
 * again, and re-verify before starting the kmmpd heartbeat thread.
 * Returns 0 on success (s_mmp_bh and s_mmp_tsk set up) or a negative
 * errno; -EBUSY means the device looks active on another node.
 */
int ext4_multi_mount_protect(struct super_block *sb,
				    ext4_fsblk_t mmp_block)
{
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
	struct buffer_head *bh = NULL;
	struct mmp_struct *mmp = NULL;
	u32 seq;
	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
	unsigned int wait_time = 0;
	int retval;

	/* The MMP block must lie inside the filesystem proper. */
	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
	    mmp_block >= ext4_blocks_count(es)) {
		ext4_warning(sb, "Invalid MMP block in superblock");
		retval = -EINVAL;
		goto failed;
	}

	retval = read_mmp_block(sb, &bh, mmp_block);
	if (retval)
		goto failed;

	mmp = (struct mmp_struct *)(bh->b_data);

	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;

	/*
	 * If check_interval in MMP block is larger, use that instead of
	 * update_interval from the superblock.
	 */
	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);

	/* Clean unmount: no other node can be using the device. */
	seq = le32_to_cpu(mmp->mmp_seq);
	if (seq == EXT4_MMP_SEQ_CLEAN)
		goto skip;

	if (seq == EXT4_MMP_SEQ_FSCK) {
		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
		retval = -EBUSY;
		goto failed;
	}

	/* Wait a bit longer than one check interval, but cap the slack. */
	wait_time = min(mmp_check_interval * 2 + 1,
			mmp_check_interval + 60);

	/* Print MMP interval if more than 20 secs. */
	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
		ext4_warning(sb, "MMP interval %u higher than expected, please"
			     " wait.\n", wait_time * 2);

	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
		retval = -ETIMEDOUT;
		goto failed;
	}

	/* If the sequence changed while we slept, someone else is alive. */
	retval = read_mmp_block(sb, &bh, mmp_block);
	if (retval)
		goto failed;
	mmp = (struct mmp_struct *)(bh->b_data);
	if (seq != le32_to_cpu(mmp->mmp_seq)) {
		dump_mmp_msg(sb, mmp,
			     "Device is already active on another node.");
		retval = -EBUSY;
		goto failed;
	}

skip:
	/*
	 * write a new random sequence number.
	 */
	seq = mmp_new_seq();
	mmp->mmp_seq = cpu_to_le32(seq);

	/*
	 * On mount / remount we are protected against fs freezing (by s_umount
	 * semaphore) and grabbing freeze protection upsets lockdep
	 */
	retval = write_mmp_block_thawed(sb, bh);
	if (retval)
		goto failed;

	/*
	 * wait for MMP interval and check mmp_seq.
	 * (wait_time is still 0 on the clean-unmount path, so no delay.)
	 */
	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
		ext4_warning(sb, "MMP startup interrupted, failing mount");
		retval = -ETIMEDOUT;
		goto failed;
	}

	/* Our random sequence must have survived untouched. */
	retval = read_mmp_block(sb, &bh, mmp_block);
	if (retval)
		goto failed;
	mmp = (struct mmp_struct *)(bh->b_data);
	if (seq != le32_to_cpu(mmp->mmp_seq)) {
		dump_mmp_msg(sb, mmp,
			     "Device is already active on another node.");
		retval = -EBUSY;
		goto failed;
	}

	/* Ownership of bh passes to the sb; kmmpd writes through it. */
	EXT4_SB(sb)->s_mmp_bh = bh;

	/*
	 * Start a kernel thread to update the MMP block periodically.
	 */
	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
					     (int)sizeof(mmp->mmp_bdevname),
					     bdevname(bh->b_bdev,
						      mmp->mmp_bdevname));
	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
		EXT4_SB(sb)->s_mmp_tsk = NULL;
		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
			     sb->s_id);
		retval = -ENOMEM;
		goto failed;
	}

	return 0;

failed:
	brelse(bh);
	return retval;
}
414