xref: /kernel/linux/linux-6.6/fs/ext4/super.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 *  linux/fs/ext4/super.c
4 *
5 * Copyright (C) 1992, 1993, 1994, 1995
6 * Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 *
10 *  from
11 *
12 *  linux/fs/minix/inode.c
13 *
14 *  Copyright (C) 1991, 1992  Linus Torvalds
15 *
16 *  Big-endian to little-endian byte-swapping/bitmaps by
17 *        David S. Miller (davem@caip.rutgers.edu), 1995
18 */
19
20#include <linux/module.h>
21#include <linux/string.h>
22#include <linux/fs.h>
23#include <linux/time.h>
24#include <linux/vmalloc.h>
25#include <linux/slab.h>
26#include <linux/init.h>
27#include <linux/blkdev.h>
28#include <linux/backing-dev.h>
29#include <linux/parser.h>
30#include <linux/buffer_head.h>
31#include <linux/exportfs.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38#include <linux/ctype.h>
39#include <linux/log2.h>
40#include <linux/crc16.h>
41#include <linux/dax.h>
42#include <linux/uaccess.h>
43#include <linux/iversion.h>
44#include <linux/unicode.h>
45#include <linux/part_stat.h>
46#include <linux/kthread.h>
47#include <linux/freezer.h>
48#include <linux/fsnotify.h>
49#include <linux/fs_context.h>
50#include <linux/fs_parser.h>
51
52#include "ext4.h"
53#include "ext4_extents.h"	/* Needed for trace points definition */
54#include "ext4_jbd2.h"
55#include "xattr.h"
56#include "acl.h"
57#include "mballoc.h"
58#include "fsmap.h"
59
60#define CREATE_TRACE_POINTS
61#include <trace/events/ext4.h>
62
63static struct ext4_lazy_init *ext4_li_info;
64static DEFINE_MUTEX(ext4_li_mtx);
65static struct ratelimit_state ext4_mount_msg_ratelimit;
66
67static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
68			     unsigned long journal_devnum);
69static int ext4_show_options(struct seq_file *seq, struct dentry *root);
70static void ext4_update_super(struct super_block *sb);
71static int ext4_commit_super(struct super_block *sb);
72static int ext4_mark_recovery_complete(struct super_block *sb,
73					struct ext4_super_block *es);
74static int ext4_clear_journal_err(struct super_block *sb,
75				  struct ext4_super_block *es);
76static int ext4_sync_fs(struct super_block *sb, int wait);
77static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
78static int ext4_unfreeze(struct super_block *sb);
79static int ext4_freeze(struct super_block *sb);
80static inline int ext2_feature_set_ok(struct super_block *sb);
81static inline int ext3_feature_set_ok(struct super_block *sb);
82static void ext4_destroy_lazyinit_thread(void);
83static void ext4_unregister_li_request(struct super_block *sb);
84static void ext4_clear_request_list(void);
85static struct inode *ext4_get_journal_inode(struct super_block *sb,
86					    unsigned int journal_inum);
87static int ext4_validate_options(struct fs_context *fc);
88static int ext4_check_opt_consistency(struct fs_context *fc,
89				      struct super_block *sb);
90static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
91static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
92static int ext4_get_tree(struct fs_context *fc);
93static int ext4_reconfigure(struct fs_context *fc);
94static void ext4_fc_free(struct fs_context *fc);
95static int ext4_init_fs_context(struct fs_context *fc);
96static void ext4_kill_sb(struct super_block *sb);
97static const struct fs_parameter_spec ext4_param_specs[];
98
99/*
100 * Lock ordering
101 *
102 * page fault path:
103 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
104 *   -> page lock -> i_data_sem (rw)
105 *
106 * buffered write path:
107 * sb_start_write -> i_mutex -> mmap_lock
108 * sb_start_write -> i_mutex -> transaction start -> page lock ->
109 *   i_data_sem (rw)
110 *
111 * truncate:
112 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
113 *   page lock
114 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
115 *   i_data_sem (rw)
116 *
117 * direct IO:
118 * sb_start_write -> i_mutex -> mmap_lock
119 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
120 *
121 * writepages:
122 * transaction start -> page lock(s) -> i_data_sem (rw)
123 */
124
125static const struct fs_context_operations ext4_context_ops = {
126	.parse_param	= ext4_parse_param,
127	.get_tree	= ext4_get_tree,
128	.reconfigure	= ext4_reconfigure,
129	.free		= ext4_fc_free,
130};
131
132
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
/*
 * Neither a built-in nor a modular ext2 driver is configured and
 * CONFIG_EXT4_USE_FOR_EXT2 is set: define an "ext2" filesystem type that is
 * backed by the ext4 callbacks.
 */
static struct file_system_type ext2_fs_type = {
	.name			= "ext2",
	.owner			= THIS_MODULE,
	.fs_flags		= FS_REQUIRES_DEV,
	.init_fs_context	= ext4_init_fs_context,
	.parameters		= ext4_param_specs,
	.kill_sb		= ext4_kill_sb,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
/* True when @sb belongs to the "ext2" filesystem type defined above. */
#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
/* No ext2 type is defined by this file, so no superblock can match it. */
#define IS_EXT2_SB(sb) (0)
#endif
148
149
150static struct file_system_type ext3_fs_type = {
151	.owner			= THIS_MODULE,
152	.name			= "ext3",
153	.init_fs_context	= ext4_init_fs_context,
154	.parameters		= ext4_param_specs,
155	.kill_sb		= ext4_kill_sb,
156	.fs_flags		= FS_REQUIRES_DEV,
157};
158MODULE_ALIAS_FS("ext3");
159MODULE_ALIAS("ext3");
160#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
161
162
163static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
164				  bh_end_io_t *end_io)
165{
166	/*
167	 * buffer's verified bit is no longer valid after reading from
168	 * disk again due to write out error, clear it to make sure we
169	 * recheck the buffer contents.
170	 */
171	clear_buffer_verified(bh);
172
173	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
174	get_bh(bh);
175	submit_bh(REQ_OP_READ | op_flags, bh);
176}
177
178void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
179			 bh_end_io_t *end_io)
180{
181	BUG_ON(!buffer_locked(bh));
182
183	if (ext4_buffer_uptodate(bh)) {
184		unlock_buffer(bh);
185		return;
186	}
187	__ext4_read_bh(bh, op_flags, end_io);
188}
189
190int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
191{
192	BUG_ON(!buffer_locked(bh));
193
194	if (ext4_buffer_uptodate(bh)) {
195		unlock_buffer(bh);
196		return 0;
197	}
198
199	__ext4_read_bh(bh, op_flags, end_io);
200
201	wait_on_buffer(bh);
202	if (buffer_uptodate(bh))
203		return 0;
204	return -EIO;
205}
206
207int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
208{
209	lock_buffer(bh);
210	if (!wait) {
211		ext4_read_bh_nowait(bh, op_flags, NULL);
212		return 0;
213	}
214	return ext4_read_bh(bh, op_flags, NULL);
215}
216
217/*
218 * This works like __bread_gfp() except it uses ERR_PTR for error
219 * returns.  Currently with sb_bread it's impossible to distinguish
220 * between ENOMEM and EIO situations (since both result in a NULL
221 * return.
222 */
223static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
224					       sector_t block,
225					       blk_opf_t op_flags, gfp_t gfp)
226{
227	struct buffer_head *bh;
228	int ret;
229
230	bh = sb_getblk_gfp(sb, block, gfp);
231	if (bh == NULL)
232		return ERR_PTR(-ENOMEM);
233	if (ext4_buffer_uptodate(bh))
234		return bh;
235
236	ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
237	if (ret) {
238		put_bh(bh);
239		return ERR_PTR(ret);
240	}
241	return bh;
242}
243
244struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
245				   blk_opf_t op_flags)
246{
247	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
248}
249
250struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
251					    sector_t block)
252{
253	return __ext4_sb_bread_gfp(sb, block, 0, 0);
254}
255
256void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
257{
258	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
259
260	if (likely(bh)) {
261		if (trylock_buffer(bh))
262			ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
263		brelse(bh);
264	}
265}
266
267static int ext4_verify_csum_type(struct super_block *sb,
268				 struct ext4_super_block *es)
269{
270	if (!ext4_has_feature_metadata_csum(sb))
271		return 1;
272
273	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
274}
275
276__le32 ext4_superblock_csum(struct super_block *sb,
277			    struct ext4_super_block *es)
278{
279	struct ext4_sb_info *sbi = EXT4_SB(sb);
280	int offset = offsetof(struct ext4_super_block, s_checksum);
281	__u32 csum;
282
283	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
284
285	return cpu_to_le32(csum);
286}
287
288static int ext4_superblock_csum_verify(struct super_block *sb,
289				       struct ext4_super_block *es)
290{
291	if (!ext4_has_metadata_csum(sb))
292		return 1;
293
294	return es->s_checksum == ext4_superblock_csum(sb, es);
295}
296
297void ext4_superblock_csum_set(struct super_block *sb)
298{
299	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
300
301	if (!ext4_has_metadata_csum(sb))
302		return;
303
304	es->s_checksum = ext4_superblock_csum(sb, es);
305}
306
307ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
308			       struct ext4_group_desc *bg)
309{
310	return le32_to_cpu(bg->bg_block_bitmap_lo) |
311		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
312		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
313}
314
315ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
316			       struct ext4_group_desc *bg)
317{
318	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
319		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
320		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
321}
322
323ext4_fsblk_t ext4_inode_table(struct super_block *sb,
324			      struct ext4_group_desc *bg)
325{
326	return le32_to_cpu(bg->bg_inode_table_lo) |
327		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
328		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
329}
330
331__u32 ext4_free_group_clusters(struct super_block *sb,
332			       struct ext4_group_desc *bg)
333{
334	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
335		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
336		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
337}
338
339__u32 ext4_free_inodes_count(struct super_block *sb,
340			      struct ext4_group_desc *bg)
341{
342	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
343		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
344		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
345}
346
347__u32 ext4_used_dirs_count(struct super_block *sb,
348			      struct ext4_group_desc *bg)
349{
350	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
351		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
352		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
353}
354
355__u32 ext4_itable_unused_count(struct super_block *sb,
356			      struct ext4_group_desc *bg)
357{
358	return le16_to_cpu(bg->bg_itable_unused_lo) |
359		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
360		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
361}
362
363void ext4_block_bitmap_set(struct super_block *sb,
364			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
365{
366	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
367	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
368		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
369}
370
371void ext4_inode_bitmap_set(struct super_block *sb,
372			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
373{
374	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
375	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
376		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
377}
378
379void ext4_inode_table_set(struct super_block *sb,
380			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
381{
382	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
383	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
384		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
385}
386
387void ext4_free_group_clusters_set(struct super_block *sb,
388				  struct ext4_group_desc *bg, __u32 count)
389{
390	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
391	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
392		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
393}
394
395void ext4_free_inodes_set(struct super_block *sb,
396			  struct ext4_group_desc *bg, __u32 count)
397{
398	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
399	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
400		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
401}
402
403void ext4_used_dirs_set(struct super_block *sb,
404			  struct ext4_group_desc *bg, __u32 count)
405{
406	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
407	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
408		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
409}
410
411void ext4_itable_unused_set(struct super_block *sb,
412			  struct ext4_group_desc *bg, __u32 count)
413{
414	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
415	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
416		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
417}
418
419static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
420{
421	now = clamp_val(now, 0, (1ull << 40) - 1);
422
423	*lo = cpu_to_le32(lower_32_bits(now));
424	*hi = upper_32_bits(now);
425}
426
427static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
428{
429	return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
430}
/*
 * Wrappers for the split timestamps of a superblock: @tstamp names the low
 * 32-bit field of @es and the high byte is the field with the same name plus
 * an "_hi" suffix.  ext4_update_tstamp() stores ktime_get_real_seconds();
 * ext4_get_tstamp() reads the combined value back.
 */
#define ext4_update_tstamp(es, tstamp)					\
	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi,	\
			     ktime_get_real_seconds())
#define ext4_get_tstamp(es, tstamp)					\
	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
436
437#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
438#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
439
440/*
441 * The ext4_maybe_update_superblock() function checks and updates the
442 * superblock if needed.
443 *
444 * This function is designed to update the on-disk superblock only under
445 * certain conditions to prevent excessive disk writes and unnecessary
446 * waking of the disk from sleep. The superblock will be updated if:
447 * 1. More than an hour has passed since the last superblock update, and
448 * 2. More than 16MB have been written since the last superblock update.
449 *
450 * @sb: The superblock
451 */
452static void ext4_maybe_update_superblock(struct super_block *sb)
453{
454	struct ext4_sb_info *sbi = EXT4_SB(sb);
455	struct ext4_super_block *es = sbi->s_es;
456	journal_t *journal = sbi->s_journal;
457	time64_t now;
458	__u64 last_update;
459	__u64 lifetime_write_kbytes;
460	__u64 diff_size;
461
462	if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
463	    !journal || (journal->j_flags & JBD2_UNMOUNT))
464		return;
465
466	now = ktime_get_real_seconds();
467	last_update = ext4_get_tstamp(es, s_wtime);
468
469	if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
470		return;
471
472	lifetime_write_kbytes = sbi->s_kbytes_written +
473		((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
474		  sbi->s_sectors_written_start) >> 1);
475
476	/* Get the number of kilobytes not written to disk to account
477	 * for statistics and compare with a multiple of 16 MB. This
478	 * is used to determine when the next superblock commit should
479	 * occur (i.e. not more often than once per 16MB if there was
480	 * less written in an hour).
481	 */
482	diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
483
484	if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
485		schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
486}
487
488/*
489 * The del_gendisk() function uninitializes the disk-specific data
490 * structures, including the bdi structure, without telling anyone
491 * else.  Once this happens, any attempt to call mark_buffer_dirty()
492 * (for example, by ext4_commit_super), will cause a kernel OOPS.
493 * This is a kludge to prevent these oops until we can put in a proper
494 * hook in del_gendisk() to inform the VFS and file system layers.
495 */
496static int block_device_ejected(struct super_block *sb)
497{
498	struct inode *bd_inode = sb->s_bdev->bd_inode;
499	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
500
501	return bdi->dev == NULL;
502}
503
504static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
505{
506	struct super_block		*sb = journal->j_private;
507	struct ext4_sb_info		*sbi = EXT4_SB(sb);
508	int				error = is_journal_aborted(journal);
509	struct ext4_journal_cb_entry	*jce;
510
511	BUG_ON(txn->t_state == T_FINISHED);
512
513	ext4_process_freed_data(sb, txn->t_tid);
514	ext4_maybe_update_superblock(sb);
515
516	spin_lock(&sbi->s_md_lock);
517	while (!list_empty(&txn->t_private_list)) {
518		jce = list_entry(txn->t_private_list.next,
519				 struct ext4_journal_cb_entry, jce_list);
520		list_del_init(&jce->jce_list);
521		spin_unlock(&sbi->s_md_lock);
522		jce->jce_func(sb, jce, error);
523		spin_lock(&sbi->s_md_lock);
524	}
525	spin_unlock(&sbi->s_md_lock);
526}
527
528/*
529 * This writepage callback for write_cache_pages()
530 * takes care of a few cases after page cleaning.
531 *
532 * write_cache_pages() already checks for dirty pages
533 * and calls clear_page_dirty_for_io(), which we want,
534 * to write protect the pages.
535 *
536 * However, we may have to redirty a page (see below.)
537 */
538static int ext4_journalled_writepage_callback(struct folio *folio,
539					      struct writeback_control *wbc,
540					      void *data)
541{
542	transaction_t *transaction = (transaction_t *) data;
543	struct buffer_head *bh, *head;
544	struct journal_head *jh;
545
546	bh = head = folio_buffers(folio);
547	do {
548		/*
549		 * We have to redirty a page in these cases:
550		 * 1) If buffer is dirty, it means the page was dirty because it
551		 * contains a buffer that needs checkpointing. So the dirty bit
552		 * needs to be preserved so that checkpointing writes the buffer
553		 * properly.
554		 * 2) If buffer is not part of the committing transaction
555		 * (we may have just accidentally come across this buffer because
556		 * inode range tracking is not exact) or if the currently running
557		 * transaction already contains this buffer as well, dirty bit
558		 * needs to be preserved so that the buffer gets writeprotected
559		 * properly on running transaction's commit.
560		 */
561		jh = bh2jh(bh);
562		if (buffer_dirty(bh) ||
563		    (jh && (jh->b_transaction != transaction ||
564			    jh->b_next_transaction))) {
565			folio_redirty_for_writepage(wbc, folio);
566			goto out;
567		}
568	} while ((bh = bh->b_this_page) != head);
569
570out:
571	return AOP_WRITEPAGE_ACTIVATE;
572}
573
574static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
575{
576	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
577	struct writeback_control wbc = {
578		.sync_mode =  WB_SYNC_ALL,
579		.nr_to_write = LONG_MAX,
580		.range_start = jinode->i_dirty_start,
581		.range_end = jinode->i_dirty_end,
582        };
583
584	return write_cache_pages(mapping, &wbc,
585				 ext4_journalled_writepage_callback,
586				 jinode->i_transaction);
587}
588
589static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
590{
591	int ret;
592
593	if (ext4_should_journal_data(jinode->i_vfs_inode))
594		ret = ext4_journalled_submit_inode_data_buffers(jinode);
595	else
596		ret = ext4_normal_submit_inode_data_buffers(jinode);
597	return ret;
598}
599
600static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
601{
602	int ret = 0;
603
604	if (!ext4_should_journal_data(jinode->i_vfs_inode))
605		ret = jbd2_journal_finish_inode_data_buffers(jinode);
606
607	return ret;
608}
609
610static bool system_going_down(void)
611{
612	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
613		|| system_state == SYSTEM_RESTART;
614}
615
/* One row of the errno <-> EXT4_ERR_* translation table. */
struct ext4_err_translation {
	int code;	/* EXT4_ERR_* value */
	int errno;	/* matching errno value */
};
620
621#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
622
623static struct ext4_err_translation err_translation[] = {
624	EXT4_ERR_TRANSLATE(EIO),
625	EXT4_ERR_TRANSLATE(ENOMEM),
626	EXT4_ERR_TRANSLATE(EFSBADCRC),
627	EXT4_ERR_TRANSLATE(EFSCORRUPTED),
628	EXT4_ERR_TRANSLATE(ENOSPC),
629	EXT4_ERR_TRANSLATE(ENOKEY),
630	EXT4_ERR_TRANSLATE(EROFS),
631	EXT4_ERR_TRANSLATE(EFBIG),
632	EXT4_ERR_TRANSLATE(EEXIST),
633	EXT4_ERR_TRANSLATE(ERANGE),
634	EXT4_ERR_TRANSLATE(EOVERFLOW),
635	EXT4_ERR_TRANSLATE(EBUSY),
636	EXT4_ERR_TRANSLATE(ENOTDIR),
637	EXT4_ERR_TRANSLATE(ENOTEMPTY),
638	EXT4_ERR_TRANSLATE(ESHUTDOWN),
639	EXT4_ERR_TRANSLATE(EFAULT),
640};
641
642static int ext4_errno_to_code(int errno)
643{
644	int i;
645
646	for (i = 0; i < ARRAY_SIZE(err_translation); i++)
647		if (err_translation[i].errno == errno)
648			return err_translation[i].code;
649	return EXT4_ERR_UNKNOWN;
650}
651
652static void save_error_info(struct super_block *sb, int error,
653			    __u32 ino, __u64 block,
654			    const char *func, unsigned int line)
655{
656	struct ext4_sb_info *sbi = EXT4_SB(sb);
657
658	/* We default to EFSCORRUPTED error... */
659	if (error == 0)
660		error = EFSCORRUPTED;
661
662	spin_lock(&sbi->s_error_lock);
663	sbi->s_add_error_count++;
664	sbi->s_last_error_code = error;
665	sbi->s_last_error_line = line;
666	sbi->s_last_error_ino = ino;
667	sbi->s_last_error_block = block;
668	sbi->s_last_error_func = func;
669	sbi->s_last_error_time = ktime_get_real_seconds();
670	if (!sbi->s_first_error_time) {
671		sbi->s_first_error_code = error;
672		sbi->s_first_error_line = line;
673		sbi->s_first_error_ino = ino;
674		sbi->s_first_error_block = block;
675		sbi->s_first_error_func = func;
676		sbi->s_first_error_time = sbi->s_last_error_time;
677	}
678	spin_unlock(&sbi->s_error_lock);
679}
680
681/* Deal with the reporting of failure conditions on a filesystem such as
682 * inconsistencies detected or read IO failures.
683 *
684 * On ext2, we can store the error state of the filesystem in the
685 * superblock.  That is not possible on ext4, because we may have other
686 * write ordering constraints on the superblock which prevent us from
687 * writing it out straight away; and given that the journal is about to
688 * be aborted, we can't rely on the current, or future, transactions to
689 * write out the superblock safely.
690 *
691 * We'll just use the jbd2_journal_abort() error code to record an error in
692 * the journal instead.  On recovery, the journal will complain about
693 * that error until we've noted it down and cleared it.
694 *
695 * If force_ro is set, we unconditionally force the filesystem into an
696 * ABORT|READONLY state, unless the error response on the fs has been set to
697 * panic in which case we take the easy way out and panic immediately. This is
698 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
699 * at a critical moment in log management.
700 */
701static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
702			      __u32 ino, __u64 block,
703			      const char *func, unsigned int line)
704{
705	journal_t *journal = EXT4_SB(sb)->s_journal;
706	bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
707
708	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
709	if (test_opt(sb, WARN_ON_ERROR))
710		WARN_ON_ONCE(1);
711
712	if (!continue_fs && !sb_rdonly(sb)) {
713		set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
714		if (journal)
715			jbd2_journal_abort(journal, -EIO);
716	}
717
718	if (!bdev_read_only(sb->s_bdev)) {
719		save_error_info(sb, error, ino, block, func, line);
720		/*
721		 * In case the fs should keep running, we need to writeout
722		 * superblock through the journal. Due to lock ordering
723		 * constraints, it may not be safe to do it right here so we
724		 * defer superblock flushing to a workqueue.
725		 */
726		if (continue_fs && journal)
727			schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
728		else
729			ext4_commit_super(sb);
730	}
731
732	/*
733	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
734	 * could panic during 'reboot -f' as the underlying device got already
735	 * disabled.
736	 */
737	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
738		panic("EXT4-fs (device %s): panic forced after error\n",
739			sb->s_id);
740	}
741
742	if (sb_rdonly(sb) || continue_fs)
743		return;
744
745	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
746	/*
747	 * Make sure updated value of ->s_mount_flags will be visible before
748	 * ->s_flags update
749	 */
750	smp_wmb();
751	sb->s_flags |= SB_RDONLY;
752}
753
754static void update_super_work(struct work_struct *work)
755{
756	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
757						s_sb_upd_work);
758	journal_t *journal = sbi->s_journal;
759	handle_t *handle;
760
761	/*
762	 * If the journal is still running, we have to write out superblock
763	 * through the journal to avoid collisions of other journalled sb
764	 * updates.
765	 *
766	 * We use directly jbd2 functions here to avoid recursing back into
767	 * ext4 error handling code during handling of previous errors.
768	 */
769	if (!sb_rdonly(sbi->s_sb) && journal) {
770		struct buffer_head *sbh = sbi->s_sbh;
771		bool call_notify_err = false;
772
773		handle = jbd2_journal_start(journal, 1);
774		if (IS_ERR(handle))
775			goto write_directly;
776		if (jbd2_journal_get_write_access(handle, sbh)) {
777			jbd2_journal_stop(handle);
778			goto write_directly;
779		}
780
781		if (sbi->s_add_error_count > 0)
782			call_notify_err = true;
783
784		ext4_update_super(sbi->s_sb);
785		if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
786			ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
787				 "superblock detected");
788			clear_buffer_write_io_error(sbh);
789			set_buffer_uptodate(sbh);
790		}
791
792		if (jbd2_journal_dirty_metadata(handle, sbh)) {
793			jbd2_journal_stop(handle);
794			goto write_directly;
795		}
796		jbd2_journal_stop(handle);
797
798		if (call_notify_err)
799			ext4_notify_error_sysfs(sbi);
800
801		return;
802	}
803write_directly:
804	/*
805	 * Write through journal failed. Write sb directly to get error info
806	 * out and hope for the best.
807	 */
808	ext4_commit_super(sbi->s_sb);
809	ext4_notify_error_sysfs(sbi);
810}
811
/* Apply the per-superblock error rate limit before logging an error. */
#define ext4_error_ratelimit(sb)					\
	___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),		\
		     "EXT4-fs error")
815
816void __ext4_error(struct super_block *sb, const char *function,
817		  unsigned int line, bool force_ro, int error, __u64 block,
818		  const char *fmt, ...)
819{
820	struct va_format vaf;
821	va_list args;
822
823	if (unlikely(ext4_forced_shutdown(sb)))
824		return;
825
826	trace_ext4_error(sb, function, line);
827	if (ext4_error_ratelimit(sb)) {
828		va_start(args, fmt);
829		vaf.fmt = fmt;
830		vaf.va = &args;
831		printk(KERN_CRIT
832		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
833		       sb->s_id, function, line, current->comm, &vaf);
834		va_end(args);
835	}
836	fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
837
838	ext4_handle_error(sb, force_ro, error, 0, block, function, line);
839}
840
841void __ext4_error_inode(struct inode *inode, const char *function,
842			unsigned int line, ext4_fsblk_t block, int error,
843			const char *fmt, ...)
844{
845	va_list args;
846	struct va_format vaf;
847
848	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
849		return;
850
851	trace_ext4_error(inode->i_sb, function, line);
852	if (ext4_error_ratelimit(inode->i_sb)) {
853		va_start(args, fmt);
854		vaf.fmt = fmt;
855		vaf.va = &args;
856		if (block)
857			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
858			       "inode #%lu: block %llu: comm %s: %pV\n",
859			       inode->i_sb->s_id, function, line, inode->i_ino,
860			       block, current->comm, &vaf);
861		else
862			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
863			       "inode #%lu: comm %s: %pV\n",
864			       inode->i_sb->s_id, function, line, inode->i_ino,
865			       current->comm, &vaf);
866		va_end(args);
867	}
868	fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
869
870	ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
871			  function, line);
872}
873
874void __ext4_error_file(struct file *file, const char *function,
875		       unsigned int line, ext4_fsblk_t block,
876		       const char *fmt, ...)
877{
878	va_list args;
879	struct va_format vaf;
880	struct inode *inode = file_inode(file);
881	char pathname[80], *path;
882
883	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
884		return;
885
886	trace_ext4_error(inode->i_sb, function, line);
887	if (ext4_error_ratelimit(inode->i_sb)) {
888		path = file_path(file, pathname, sizeof(pathname));
889		if (IS_ERR(path))
890			path = "(unknown)";
891		va_start(args, fmt);
892		vaf.fmt = fmt;
893		vaf.va = &args;
894		if (block)
895			printk(KERN_CRIT
896			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
897			       "block %llu: comm %s: path %s: %pV\n",
898			       inode->i_sb->s_id, function, line, inode->i_ino,
899			       block, current->comm, path, &vaf);
900		else
901			printk(KERN_CRIT
902			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
903			       "comm %s: path %s: %pV\n",
904			       inode->i_sb->s_id, function, line, inode->i_ino,
905			       current->comm, path, &vaf);
906		va_end(args);
907	}
908	fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
909
910	ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
911			  function, line);
912}
913
914const char *ext4_decode_error(struct super_block *sb, int errno,
915			      char nbuf[16])
916{
917	char *errstr = NULL;
918
919	switch (errno) {
920	case -EFSCORRUPTED:
921		errstr = "Corrupt filesystem";
922		break;
923	case -EFSBADCRC:
924		errstr = "Filesystem failed CRC";
925		break;
926	case -EIO:
927		errstr = "IO failure";
928		break;
929	case -ENOMEM:
930		errstr = "Out of memory";
931		break;
932	case -EROFS:
933		if (!sb || (EXT4_SB(sb)->s_journal &&
934			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
935			errstr = "Journal has aborted";
936		else
937			errstr = "Readonly filesystem";
938		break;
939	default:
940		/* If the caller passed in an extra buffer for unknown
941		 * errors, textualise them now.  Else we just return
942		 * NULL. */
943		if (nbuf) {
944			/* Check for truncated error codes... */
945			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
946				errstr = nbuf;
947		}
948		break;
949	}
950
951	return errstr;
952}
953
954/* __ext4_std_error decodes expected errors from journaling functions
955 * automatically and invokes the appropriate error response.  */
956
957void __ext4_std_error(struct super_block *sb, const char *function,
958		      unsigned int line, int errno)
959{
960	char nbuf[16];
961	const char *errstr;
962
963	if (unlikely(ext4_forced_shutdown(sb)))
964		return;
965
966	/* Special case: if the error is EROFS, and we're not already
967	 * inside a transaction, then there's really no point in logging
968	 * an error. */
969	if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
970		return;
971
972	if (ext4_error_ratelimit(sb)) {
973		errstr = ext4_decode_error(sb, errno, nbuf);
974		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
975		       sb->s_id, function, line, errstr);
976	}
977	fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
978
979	ext4_handle_error(sb, false, -errno, 0, 0, function, line);
980}
981
982void __ext4_msg(struct super_block *sb,
983		const char *prefix, const char *fmt, ...)
984{
985	struct va_format vaf;
986	va_list args;
987
988	if (sb) {
989		atomic_inc(&EXT4_SB(sb)->s_msg_count);
990		if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
991				  "EXT4-fs"))
992			return;
993	}
994
995	va_start(args, fmt);
996	vaf.fmt = fmt;
997	vaf.va = &args;
998	if (sb)
999		printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
1000	else
1001		printk("%sEXT4-fs: %pV\n", prefix, &vaf);
1002	va_end(args);
1003}
1004
1005static int ext4_warning_ratelimit(struct super_block *sb)
1006{
1007	atomic_inc(&EXT4_SB(sb)->s_warning_count);
1008	return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
1009			    "EXT4-fs warning");
1010}
1011
1012void __ext4_warning(struct super_block *sb, const char *function,
1013		    unsigned int line, const char *fmt, ...)
1014{
1015	struct va_format vaf;
1016	va_list args;
1017
1018	if (!ext4_warning_ratelimit(sb))
1019		return;
1020
1021	va_start(args, fmt);
1022	vaf.fmt = fmt;
1023	vaf.va = &args;
1024	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
1025	       sb->s_id, function, line, &vaf);
1026	va_end(args);
1027}
1028
1029void __ext4_warning_inode(const struct inode *inode, const char *function,
1030			  unsigned int line, const char *fmt, ...)
1031{
1032	struct va_format vaf;
1033	va_list args;
1034
1035	if (!ext4_warning_ratelimit(inode->i_sb))
1036		return;
1037
1038	va_start(args, fmt);
1039	vaf.fmt = fmt;
1040	vaf.va = &args;
1041	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
1042	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
1043	       function, line, inode->i_ino, current->comm, &vaf);
1044	va_end(args);
1045}
1046
1047void __ext4_grp_locked_error(const char *function, unsigned int line,
1048			     struct super_block *sb, ext4_group_t grp,
1049			     unsigned long ino, ext4_fsblk_t block,
1050			     const char *fmt, ...)
1051__releases(bitlock)
1052__acquires(bitlock)
1053{
1054	struct va_format vaf;
1055	va_list args;
1056
1057	if (unlikely(ext4_forced_shutdown(sb)))
1058		return;
1059
1060	trace_ext4_error(sb, function, line);
1061	if (ext4_error_ratelimit(sb)) {
1062		va_start(args, fmt);
1063		vaf.fmt = fmt;
1064		vaf.va = &args;
1065		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
1066		       sb->s_id, function, line, grp);
1067		if (ino)
1068			printk(KERN_CONT "inode %lu: ", ino);
1069		if (block)
1070			printk(KERN_CONT "block %llu:",
1071			       (unsigned long long) block);
1072		printk(KERN_CONT "%pV\n", &vaf);
1073		va_end(args);
1074	}
1075
1076	if (test_opt(sb, ERRORS_CONT)) {
1077		if (test_opt(sb, WARN_ON_ERROR))
1078			WARN_ON_ONCE(1);
1079		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
1080		if (!bdev_read_only(sb->s_bdev)) {
1081			save_error_info(sb, EFSCORRUPTED, ino, block, function,
1082					line);
1083			schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
1084		}
1085		return;
1086	}
1087	ext4_unlock_group(sb, grp);
1088	ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
1089	/*
1090	 * We only get here in the ERRORS_RO case; relocking the group
1091	 * may be dangerous, but nothing bad will happen since the
1092	 * filesystem will have already been marked read/only and the
1093	 * journal has been aborted.  We return 1 as a hint to callers
1094	 * who might what to use the return value from
1095	 * ext4_grp_locked_error() to distinguish between the
1096	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
1097	 * aggressively from the ext4 function in question, with a
1098	 * more appropriate error code.
1099	 */
1100	ext4_lock_group(sb, grp);
1101	return;
1102}
1103
1104void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
1105				     ext4_group_t group,
1106				     unsigned int flags)
1107{
1108	struct ext4_sb_info *sbi = EXT4_SB(sb);
1109	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1110	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
1111	int ret;
1112
1113	if (!grp || !gdp)
1114		return;
1115	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
1116		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1117					    &grp->bb_state);
1118		if (!ret)
1119			percpu_counter_sub(&sbi->s_freeclusters_counter,
1120					   grp->bb_free);
1121	}
1122
1123	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
1124		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
1125					    &grp->bb_state);
1126		if (!ret && gdp) {
1127			int count;
1128
1129			count = ext4_free_inodes_count(sb, gdp);
1130			percpu_counter_sub(&sbi->s_freeinodes_counter,
1131					   count);
1132		}
1133	}
1134}
1135
1136void ext4_update_dynamic_rev(struct super_block *sb)
1137{
1138	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1139
1140	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
1141		return;
1142
1143	ext4_warning(sb,
1144		     "updating to rev %d because of new feature flag, "
1145		     "running e2fsck is recommended",
1146		     EXT4_DYNAMIC_REV);
1147
1148	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
1149	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
1150	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
1151	/* leave es->s_feature_*compat flags alone */
1152	/* es->s_uuid will be set by e2fsck if empty */
1153
1154	/*
1155	 * The rest of the superblock fields should be zero, and if not it
1156	 * means they are likely already in use, so leave them alone.  We
1157	 * can leave it up to e2fsck to clean up any inconsistencies there.
1158	 */
1159}
1160
1161static inline struct inode *orphan_list_entry(struct list_head *l)
1162{
1163	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
1164}
1165
1166static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
1167{
1168	struct list_head *l;
1169
1170	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
1171		 le32_to_cpu(sbi->s_es->s_last_orphan));
1172
1173	printk(KERN_ERR "sb_info orphan list:\n");
1174	list_for_each(l, &sbi->s_orphan) {
1175		struct inode *inode = orphan_list_entry(l);
1176		printk(KERN_ERR "  "
1177		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
1178		       inode->i_sb->s_id, inode->i_ino, inode,
1179		       inode->i_mode, inode->i_nlink,
1180		       NEXT_ORPHAN(inode));
1181	}
1182}
1183
1184#ifdef CONFIG_QUOTA
1185static int ext4_quota_off(struct super_block *sb, int type);
1186
1187static inline void ext4_quotas_off(struct super_block *sb, int type)
1188{
1189	BUG_ON(type > EXT4_MAXQUOTAS);
1190
1191	/* Use our quota_off function to clear inode flags etc. */
1192	for (type--; type >= 0; type--)
1193		ext4_quota_off(sb, type);
1194}
1195
1196/*
1197 * This is a helper function which is used in the mount/remount
1198 * codepaths (which holds s_umount) to fetch the quota file name.
1199 */
1200static inline char *get_qf_name(struct super_block *sb,
1201				struct ext4_sb_info *sbi,
1202				int type)
1203{
1204	return rcu_dereference_protected(sbi->s_qf_names[type],
1205					 lockdep_is_held(&sb->s_umount));
1206}
1207#else
/* Without CONFIG_QUOTA there is nothing to turn off. */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
	/* intentionally empty */
}
1211#endif
1212
1213static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
1214{
1215	ext4_fsblk_t block;
1216	int err;
1217
1218	block = ext4_count_free_clusters(sbi->s_sb);
1219	ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
1220	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
1221				  GFP_KERNEL);
1222	if (!err) {
1223		unsigned long freei = ext4_count_free_inodes(sbi->s_sb);
1224		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
1225		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
1226					  GFP_KERNEL);
1227	}
1228	if (!err)
1229		err = percpu_counter_init(&sbi->s_dirs_counter,
1230					  ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
1231	if (!err)
1232		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
1233					  GFP_KERNEL);
1234	if (!err)
1235		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
1236					  GFP_KERNEL);
1237	if (!err)
1238		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
1239
1240	if (err)
1241		ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");
1242
1243	return err;
1244}
1245
1246static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
1247{
1248	percpu_counter_destroy(&sbi->s_freeclusters_counter);
1249	percpu_counter_destroy(&sbi->s_freeinodes_counter);
1250	percpu_counter_destroy(&sbi->s_dirs_counter);
1251	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
1252	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
1253	percpu_free_rwsem(&sbi->s_writepages_rwsem);
1254}
1255
1256static void ext4_group_desc_free(struct ext4_sb_info *sbi)
1257{
1258	struct buffer_head **group_desc;
1259	int i;
1260
1261	rcu_read_lock();
1262	group_desc = rcu_dereference(sbi->s_group_desc);
1263	for (i = 0; i < sbi->s_gdb_count; i++)
1264		brelse(group_desc[i]);
1265	kvfree(group_desc);
1266	rcu_read_unlock();
1267}
1268
1269static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
1270{
1271	struct flex_groups **flex_groups;
1272	int i;
1273
1274	rcu_read_lock();
1275	flex_groups = rcu_dereference(sbi->s_flex_groups);
1276	if (flex_groups) {
1277		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
1278			kvfree(flex_groups[i]);
1279		kvfree(flex_groups);
1280	}
1281	rcu_read_unlock();
1282}
1283
1284static void ext4_put_super(struct super_block *sb)
1285{
1286	struct ext4_sb_info *sbi = EXT4_SB(sb);
1287	struct ext4_super_block *es = sbi->s_es;
1288	int aborted = 0;
1289	int err;
1290
1291	/*
1292	 * Unregister sysfs before destroying jbd2 journal.
1293	 * Since we could still access attr_journal_task attribute via sysfs
1294	 * path which could have sbi->s_journal->j_task as NULL
1295	 * Unregister sysfs before flush sbi->s_sb_upd_work.
1296	 * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
1297	 * read metadata verify failed then will queue error work.
1298	 * update_super_work will call start_this_handle may trigger
1299	 * BUG_ON.
1300	 */
1301	ext4_unregister_sysfs(sb);
1302
1303	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
1304		ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
1305			 &sb->s_uuid);
1306
1307	ext4_unregister_li_request(sb);
1308	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
1309
1310	flush_work(&sbi->s_sb_upd_work);
1311	destroy_workqueue(sbi->rsv_conversion_wq);
1312	ext4_release_orphan_info(sb);
1313
1314	if (sbi->s_journal) {
1315		aborted = is_journal_aborted(sbi->s_journal);
1316		err = jbd2_journal_destroy(sbi->s_journal);
1317		sbi->s_journal = NULL;
1318		if ((err < 0) && !aborted) {
1319			ext4_abort(sb, -err, "Couldn't clean up the journal");
1320		}
1321	}
1322
1323	ext4_es_unregister_shrinker(sbi);
1324	timer_shutdown_sync(&sbi->s_err_report);
1325	ext4_release_system_zone(sb);
1326	ext4_mb_release(sb);
1327	ext4_ext_release(sb);
1328
1329	if (!sb_rdonly(sb) && !aborted) {
1330		ext4_clear_feature_journal_needs_recovery(sb);
1331		ext4_clear_feature_orphan_present(sb);
1332		es->s_state = cpu_to_le16(sbi->s_mount_state);
1333	}
1334	if (!sb_rdonly(sb))
1335		ext4_commit_super(sb);
1336
1337	ext4_group_desc_free(sbi);
1338	ext4_flex_groups_free(sbi);
1339	ext4_percpu_param_destroy(sbi);
1340#ifdef CONFIG_QUOTA
1341	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
1342		kfree(get_qf_name(sb, sbi, i));
1343#endif
1344
1345	/* Debugging code just in case the in-memory inode orphan list
1346	 * isn't empty.  The on-disk one can be non-empty if we've
1347	 * detected an error and taken the fs readonly, but the
1348	 * in-memory list had better be clean by this point. */
1349	if (!list_empty(&sbi->s_orphan))
1350		dump_orphan_list(sb, sbi);
1351	ASSERT(list_empty(&sbi->s_orphan));
1352
1353	sync_blockdev(sb->s_bdev);
1354	invalidate_bdev(sb->s_bdev);
1355	if (sbi->s_journal_bdev) {
1356		/*
1357		 * Invalidate the journal device's buffers.  We don't want them
1358		 * floating about in memory - the physical journal device may
1359		 * hotswapped, and it breaks the `ro-after' testing code.
1360		 */
1361		sync_blockdev(sbi->s_journal_bdev);
1362		invalidate_bdev(sbi->s_journal_bdev);
1363	}
1364
1365	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
1366	sbi->s_ea_inode_cache = NULL;
1367
1368	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
1369	sbi->s_ea_block_cache = NULL;
1370
1371	ext4_stop_mmpd(sbi);
1372
1373	brelse(sbi->s_sbh);
1374	sb->s_fs_info = NULL;
1375	/*
1376	 * Now that we are completely done shutting down the
1377	 * superblock, we need to actually destroy the kobject.
1378	 */
1379	kobject_put(&sbi->s_kobj);
1380	wait_for_completion(&sbi->s_kobj_unregister);
1381	if (sbi->s_chksum_driver)
1382		crypto_free_shash(sbi->s_chksum_driver);
1383	kfree(sbi->s_blockgroup_lock);
1384	fs_put_dax(sbi->s_daxdev, NULL);
1385	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
1386#if IS_ENABLED(CONFIG_UNICODE)
1387	utf8_unload(sb->s_encoding);
1388#endif
1389	kfree(sbi);
1390}
1391
1392static struct kmem_cache *ext4_inode_cachep;
1393
1394/*
1395 * Called inside transaction, so use GFP_NOFS
1396 */
1397static struct inode *ext4_alloc_inode(struct super_block *sb)
1398{
1399	struct ext4_inode_info *ei;
1400
1401	ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
1402	if (!ei)
1403		return NULL;
1404
1405	inode_set_iversion(&ei->vfs_inode, 1);
1406	ei->i_flags = 0;
1407	spin_lock_init(&ei->i_raw_lock);
1408	ei->i_prealloc_node = RB_ROOT;
1409	atomic_set(&ei->i_prealloc_active, 0);
1410	rwlock_init(&ei->i_prealloc_lock);
1411	ext4_es_init_tree(&ei->i_es_tree);
1412	rwlock_init(&ei->i_es_lock);
1413	INIT_LIST_HEAD(&ei->i_es_list);
1414	ei->i_es_all_nr = 0;
1415	ei->i_es_shk_nr = 0;
1416	ei->i_es_shrink_lblk = 0;
1417	ei->i_reserved_data_blocks = 0;
1418	spin_lock_init(&(ei->i_block_reservation_lock));
1419	ext4_init_pending_tree(&ei->i_pending_tree);
1420#ifdef CONFIG_QUOTA
1421	ei->i_reserved_quota = 0;
1422	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
1423#endif
1424	ei->jinode = NULL;
1425	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
1426	spin_lock_init(&ei->i_completed_io_lock);
1427	ei->i_sync_tid = 0;
1428	ei->i_datasync_tid = 0;
1429	atomic_set(&ei->i_unwritten, 0);
1430	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
1431	ext4_fc_init_inode(&ei->vfs_inode);
1432	mutex_init(&ei->i_fc_lock);
1433	return &ei->vfs_inode;
1434}
1435
/*
 * ext4_drop_inode - ->drop_inode hook
 *
 * Asks generic_drop_inode() first and, only if that says "keep", gives
 * fscrypt_drop_inode() a chance to request the drop.  The decision is
 * traced and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int drop;

	drop = generic_drop_inode(inode);
	if (drop == 0)
		drop = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, drop);
	return drop;
}
1446
1447static void ext4_free_in_core_inode(struct inode *inode)
1448{
1449	fscrypt_free_inode(inode);
1450	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
1451		pr_warn("%s: inode %ld still in fc list",
1452			__func__, inode->i_ino);
1453	}
1454	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1455}
1456
1457static void ext4_destroy_inode(struct inode *inode)
1458{
1459	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1460		ext4_msg(inode->i_sb, KERN_ERR,
1461			 "Inode %lu (%p): orphan list check failed!",
1462			 inode->i_ino, EXT4_I(inode));
1463		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1464				EXT4_I(inode), sizeof(struct ext4_inode_info),
1465				true);
1466		dump_stack();
1467	}
1468
1469	if (EXT4_I(inode)->i_reserved_data_blocks)
1470		ext4_msg(inode->i_sb, KERN_ERR,
1471			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
1472			 inode->i_ino, EXT4_I(inode),
1473			 EXT4_I(inode)->i_reserved_data_blocks);
1474}
1475
1476static void ext4_shutdown(struct super_block *sb)
1477{
1478       ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
1479}
1480
1481static void init_once(void *foo)
1482{
1483	struct ext4_inode_info *ei = foo;
1484
1485	INIT_LIST_HEAD(&ei->i_orphan);
1486	init_rwsem(&ei->xattr_sem);
1487	init_rwsem(&ei->i_data_sem);
1488	inode_init_once(&ei->vfs_inode);
1489	ext4_fc_init_inode(&ei->vfs_inode);
1490}
1491
1492static int __init init_inodecache(void)
1493{
1494	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1495				sizeof(struct ext4_inode_info), 0,
1496				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1497					SLAB_ACCOUNT),
1498				offsetof(struct ext4_inode_info, i_data),
1499				sizeof_field(struct ext4_inode_info, i_data),
1500				init_once);
1501	if (ext4_inode_cachep == NULL)
1502		return -ENOMEM;
1503	return 0;
1504}
1505
1506static void destroy_inodecache(void)
1507{
1508	/*
1509	 * Make sure all delayed rcu free inodes are flushed before we
1510	 * destroy cache.
1511	 */
1512	rcu_barrier();
1513	kmem_cache_destroy(ext4_inode_cachep);
1514}
1515
1516void ext4_clear_inode(struct inode *inode)
1517{
1518	ext4_fc_del(inode);
1519	invalidate_inode_buffers(inode);
1520	clear_inode(inode);
1521	ext4_discard_preallocations(inode, 0);
1522	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1523	dquot_drop(inode);
1524	if (EXT4_I(inode)->jinode) {
1525		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1526					       EXT4_I(inode)->jinode);
1527		jbd2_free_inode(EXT4_I(inode)->jinode);
1528		EXT4_I(inode)->jinode = NULL;
1529	}
1530	fscrypt_put_encryption_info(inode);
1531	fsverity_cleanup_inode(inode);
1532}
1533
1534static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1535					u64 ino, u32 generation)
1536{
1537	struct inode *inode;
1538
1539	/*
1540	 * Currently we don't know the generation for parent directory, so
1541	 * a generation of 0 means "accept any"
1542	 */
1543	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1544	if (IS_ERR(inode))
1545		return ERR_CAST(inode);
1546	if (generation && inode->i_generation != generation) {
1547		iput(inode);
1548		return ERR_PTR(-ESTALE);
1549	}
1550
1551	return inode;
1552}
1553
1554static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1555					int fh_len, int fh_type)
1556{
1557	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1558				    ext4_nfs_get_inode);
1559}
1560
1561static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1562					int fh_len, int fh_type)
1563{
1564	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1565				    ext4_nfs_get_inode);
1566}
1567
1568static int ext4_nfs_commit_metadata(struct inode *inode)
1569{
1570	struct writeback_control wbc = {
1571		.sync_mode = WB_SYNC_ALL
1572	};
1573
1574	trace_ext4_nfs_commit_metadata(inode);
1575	return ext4_write_inode(inode, &wbc);
1576}
1577
1578#ifdef CONFIG_QUOTA
1579static const char * const quotatypes[] = INITQFNAMES;
1580#define QTYPE2NAME(t) (quotatypes[t])
1581
1582static int ext4_write_dquot(struct dquot *dquot);
1583static int ext4_acquire_dquot(struct dquot *dquot);
1584static int ext4_release_dquot(struct dquot *dquot);
1585static int ext4_mark_dquot_dirty(struct dquot *dquot);
1586static int ext4_write_info(struct super_block *sb, int type);
1587static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1588			 const struct path *path);
1589static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1590			       size_t len, loff_t off);
1591static ssize_t ext4_quota_write(struct super_block *sb, int type,
1592				const char *data, size_t len, loff_t off);
1593static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
1594			     unsigned int flags);
1595
1596static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
1597{
1598	return EXT4_I(inode)->i_dquot;
1599}
1600
1601static const struct dquot_operations ext4_quota_operations = {
1602	.get_reserved_space	= ext4_get_reserved_space,
1603	.write_dquot		= ext4_write_dquot,
1604	.acquire_dquot		= ext4_acquire_dquot,
1605	.release_dquot		= ext4_release_dquot,
1606	.mark_dirty		= ext4_mark_dquot_dirty,
1607	.write_info		= ext4_write_info,
1608	.alloc_dquot		= dquot_alloc,
1609	.destroy_dquot		= dquot_destroy,
1610	.get_projid		= ext4_get_projid,
1611	.get_inode_usage	= ext4_get_inode_usage,
1612	.get_next_id		= dquot_get_next_id,
1613};
1614
1615static const struct quotactl_ops ext4_qctl_operations = {
1616	.quota_on	= ext4_quota_on,
1617	.quota_off	= ext4_quota_off,
1618	.quota_sync	= dquot_quota_sync,
1619	.get_state	= dquot_get_state,
1620	.set_info	= dquot_set_dqinfo,
1621	.get_dqblk	= dquot_get_dqblk,
1622	.set_dqblk	= dquot_set_dqblk,
1623	.get_nextdqblk	= dquot_get_next_dqblk,
1624};
1625#endif
1626
1627static const struct super_operations ext4_sops = {
1628	.alloc_inode	= ext4_alloc_inode,
1629	.free_inode	= ext4_free_in_core_inode,
1630	.destroy_inode	= ext4_destroy_inode,
1631	.write_inode	= ext4_write_inode,
1632	.dirty_inode	= ext4_dirty_inode,
1633	.drop_inode	= ext4_drop_inode,
1634	.evict_inode	= ext4_evict_inode,
1635	.put_super	= ext4_put_super,
1636	.sync_fs	= ext4_sync_fs,
1637	.freeze_fs	= ext4_freeze,
1638	.unfreeze_fs	= ext4_unfreeze,
1639	.statfs		= ext4_statfs,
1640	.show_options	= ext4_show_options,
1641	.shutdown	= ext4_shutdown,
1642#ifdef CONFIG_QUOTA
1643	.quota_read	= ext4_quota_read,
1644	.quota_write	= ext4_quota_write,
1645	.get_dquots	= ext4_get_dquots,
1646#endif
1647};
1648
1649static const struct export_operations ext4_export_ops = {
1650	.fh_to_dentry = ext4_fh_to_dentry,
1651	.fh_to_parent = ext4_fh_to_parent,
1652	.get_parent = ext4_get_parent,
1653	.commit_metadata = ext4_nfs_commit_metadata,
1654};
1655
/*
 * Mount option tokens.  The numeric values follow declaration order, so
 * new entries must not be inserted in a way that callers relying on a
 * particular value would notice.
 */
enum {
	Opt_bsd_df,
	Opt_minix_df,
	Opt_grpid,
	Opt_nogrpid,
	Opt_resgid,
	Opt_resuid,
	Opt_sb,
	Opt_nouid32,
	Opt_debug,
	Opt_removed,
	Opt_user_xattr,
	Opt_acl,
	Opt_auto_da_alloc,
	Opt_noauto_da_alloc,
	Opt_noload,
	Opt_commit,
	Opt_min_batch_time,
	Opt_max_batch_time,
	Opt_journal_dev,
	Opt_journal_path,
	Opt_journal_checksum,
	Opt_journal_async_commit,
	Opt_abort,
	Opt_data_journal,
	Opt_data_ordered,
	Opt_data_writeback,
	Opt_data_err_abort,
	Opt_data_err_ignore,
	Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota,
	Opt_grpjquota,
	Opt_quota,
	Opt_noquota,
	Opt_barrier,
	Opt_nobarrier,
	Opt_err,
	Opt_usrquota,
	Opt_grpquota,
	Opt_prjquota,
	Opt_dax,
	Opt_dax_always,
	Opt_dax_inode,
	Opt_dax_never,
	Opt_stripe,
	Opt_delalloc,
	Opt_nodelalloc,
	Opt_warn_on_error,
	Opt_nowarn_on_error,
	Opt_mblk_io_submit,
	Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit,
	Opt_block_validity,
	Opt_noblock_validity,
	Opt_inode_readahead_blks,
	Opt_journal_ioprio,
	Opt_dioread_nolock,
	Opt_dioread_lock,
	Opt_discard,
	Opt_nodiscard,
	Opt_init_itable,
	Opt_noinit_itable,
	Opt_max_dir_size_kb,
	Opt_nojournal_checksum,
	Opt_nombcache,
	Opt_no_prefetch_block_bitmaps,
	Opt_mb_optimize_scan,
	Opt_errors,
	Opt_data,
	Opt_data_err,
	Opt_jqfmt,
	Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay,
	Opt_fc_debug_force
#endif
};
1684
1685static const struct constant_table ext4_param_errors[] = {
1686	{"continue",	EXT4_MOUNT_ERRORS_CONT},
1687	{"panic",	EXT4_MOUNT_ERRORS_PANIC},
1688	{"remount-ro",	EXT4_MOUNT_ERRORS_RO},
1689	{}
1690};
1691
1692static const struct constant_table ext4_param_data[] = {
1693	{"journal",	EXT4_MOUNT_JOURNAL_DATA},
1694	{"ordered",	EXT4_MOUNT_ORDERED_DATA},
1695	{"writeback",	EXT4_MOUNT_WRITEBACK_DATA},
1696	{}
1697};
1698
1699static const struct constant_table ext4_param_data_err[] = {
1700	{"abort",	Opt_data_err_abort},
1701	{"ignore",	Opt_data_err_ignore},
1702	{}
1703};
1704
1705static const struct constant_table ext4_param_jqfmt[] = {
1706	{"vfsold",	QFMT_VFS_OLD},
1707	{"vfsv0",	QFMT_VFS_V0},
1708	{"vfsv1",	QFMT_VFS_V1},
1709	{}
1710};
1711
1712static const struct constant_table ext4_param_dax[] = {
1713	{"always",	Opt_dax_always},
1714	{"inode",	Opt_dax_inode},
1715	{"never",	Opt_dax_never},
1716	{}
1717};
1718
1719/* String parameter that allows empty argument */
1720#define fsparam_string_empty(NAME, OPT) \
1721	__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
1722
1723/*
1724 * Mount option specification
1725 * We don't use fsparam_flag_no because of the way we set the
1726 * options and the way we show them in _ext4_show_options(). To
1727 * keep the changes to a minimum, let's keep the negative options
1728 * separate for now.
1729 */
1730static const struct fs_parameter_spec ext4_param_specs[] = {
1731	fsparam_flag	("bsddf",		Opt_bsd_df),
1732	fsparam_flag	("minixdf",		Opt_minix_df),
1733	fsparam_flag	("grpid",		Opt_grpid),
1734	fsparam_flag	("bsdgroups",		Opt_grpid),
1735	fsparam_flag	("nogrpid",		Opt_nogrpid),
1736	fsparam_flag	("sysvgroups",		Opt_nogrpid),
1737	fsparam_u32	("resgid",		Opt_resgid),
1738	fsparam_u32	("resuid",		Opt_resuid),
1739	fsparam_u32	("sb",			Opt_sb),
1740	fsparam_enum	("errors",		Opt_errors, ext4_param_errors),
1741	fsparam_flag	("nouid32",		Opt_nouid32),
1742	fsparam_flag	("debug",		Opt_debug),
1743	fsparam_flag	("oldalloc",		Opt_removed),
1744	fsparam_flag	("orlov",		Opt_removed),
1745	fsparam_flag	("user_xattr",		Opt_user_xattr),
1746	fsparam_flag	("acl",			Opt_acl),
1747	fsparam_flag	("norecovery",		Opt_noload),
1748	fsparam_flag	("noload",		Opt_noload),
1749	fsparam_flag	("bh",			Opt_removed),
1750	fsparam_flag	("nobh",		Opt_removed),
1751	fsparam_u32	("commit",		Opt_commit),
1752	fsparam_u32	("min_batch_time",	Opt_min_batch_time),
1753	fsparam_u32	("max_batch_time",	Opt_max_batch_time),
1754	fsparam_u32	("journal_dev",		Opt_journal_dev),
1755	fsparam_bdev	("journal_path",	Opt_journal_path),
1756	fsparam_flag	("journal_checksum",	Opt_journal_checksum),
1757	fsparam_flag	("nojournal_checksum",	Opt_nojournal_checksum),
1758	fsparam_flag	("journal_async_commit",Opt_journal_async_commit),
1759	fsparam_flag	("abort",		Opt_abort),
1760	fsparam_enum	("data",		Opt_data, ext4_param_data),
1761	fsparam_enum	("data_err",		Opt_data_err,
1762						ext4_param_data_err),
1763	fsparam_string_empty
1764			("usrjquota",		Opt_usrjquota),
1765	fsparam_string_empty
1766			("grpjquota",		Opt_grpjquota),
1767	fsparam_enum	("jqfmt",		Opt_jqfmt, ext4_param_jqfmt),
1768	fsparam_flag	("grpquota",		Opt_grpquota),
1769	fsparam_flag	("quota",		Opt_quota),
1770	fsparam_flag	("noquota",		Opt_noquota),
1771	fsparam_flag	("usrquota",		Opt_usrquota),
1772	fsparam_flag	("prjquota",		Opt_prjquota),
1773	fsparam_flag	("barrier",		Opt_barrier),
1774	fsparam_u32	("barrier",		Opt_barrier),
1775	fsparam_flag	("nobarrier",		Opt_nobarrier),
1776	fsparam_flag	("i_version",		Opt_removed),
1777	fsparam_flag	("dax",			Opt_dax),
1778	fsparam_enum	("dax",			Opt_dax_type, ext4_param_dax),
1779	fsparam_u32	("stripe",		Opt_stripe),
1780	fsparam_flag	("delalloc",		Opt_delalloc),
1781	fsparam_flag	("nodelalloc",		Opt_nodelalloc),
1782	fsparam_flag	("warn_on_error",	Opt_warn_on_error),
1783	fsparam_flag	("nowarn_on_error",	Opt_nowarn_on_error),
1784	fsparam_u32	("debug_want_extra_isize",
1785						Opt_debug_want_extra_isize),
1786	fsparam_flag	("mblk_io_submit",	Opt_removed),
1787	fsparam_flag	("nomblk_io_submit",	Opt_removed),
1788	fsparam_flag	("block_validity",	Opt_block_validity),
1789	fsparam_flag	("noblock_validity",	Opt_noblock_validity),
1790	fsparam_u32	("inode_readahead_blks",
1791						Opt_inode_readahead_blks),
1792	fsparam_u32	("journal_ioprio",	Opt_journal_ioprio),
1793	fsparam_u32	("auto_da_alloc",	Opt_auto_da_alloc),
1794	fsparam_flag	("auto_da_alloc",	Opt_auto_da_alloc),
1795	fsparam_flag	("noauto_da_alloc",	Opt_noauto_da_alloc),
1796	fsparam_flag	("dioread_nolock",	Opt_dioread_nolock),
1797	fsparam_flag	("nodioread_nolock",	Opt_dioread_lock),
1798	fsparam_flag	("dioread_lock",	Opt_dioread_lock),
1799	fsparam_flag	("discard",		Opt_discard),
1800	fsparam_flag	("nodiscard",		Opt_nodiscard),
1801	fsparam_u32	("init_itable",		Opt_init_itable),
1802	fsparam_flag	("init_itable",		Opt_init_itable),
1803	fsparam_flag	("noinit_itable",	Opt_noinit_itable),
1804#ifdef CONFIG_EXT4_DEBUG
1805	fsparam_flag	("fc_debug_force",	Opt_fc_debug_force),
1806	fsparam_u32	("fc_debug_max_replay",	Opt_fc_debug_max_replay),
1807#endif
1808	fsparam_u32	("max_dir_size_kb",	Opt_max_dir_size_kb),
1809	fsparam_flag	("test_dummy_encryption",
1810						Opt_test_dummy_encryption),
1811	fsparam_string	("test_dummy_encryption",
1812						Opt_test_dummy_encryption),
1813	fsparam_flag	("inlinecrypt",		Opt_inlinecrypt),
1814	fsparam_flag	("nombcache",		Opt_nombcache),
1815	fsparam_flag	("no_mbcache",		Opt_nombcache),	/* for backward compatibility */
1816	fsparam_flag	("prefetch_block_bitmaps",
1817						Opt_removed),
1818	fsparam_flag	("no_prefetch_block_bitmaps",
1819						Opt_no_prefetch_block_bitmaps),
1820	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
1821	fsparam_string	("check",		Opt_removed),	/* mount option from ext2/3 */
1822	fsparam_flag	("nocheck",		Opt_removed),	/* mount option from ext2/3 */
1823	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
1824	fsparam_flag	("noreservation",	Opt_removed),	/* mount option from ext2/3 */
1825	fsparam_u32	("journal",		Opt_removed),	/* mount option from ext2/3 */
1826	{}
1827};
1828
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))

/*
 * Bits for the "flags" member of struct mount_opts.  Without CONFIG_QUOTA
 * both quota markers collapse to MOPT_NOSUPPORT.
 */
#define MOPT_SET	(1 << 0)
#define MOPT_CLEAR	(1 << 1)
#define MOPT_NOSUPPORT	(1 << 2)
#define MOPT_EXPLICIT	(1 << 3)
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	(1 << 4)
#else
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_NO_EXT2	(1 << 5)
#define MOPT_NO_EXT3	(1 << 6)
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_SKIP	(1 << 7)
#define MOPT_2		(1 << 8)
1847
1848static const struct mount_opts {
1849	int	token;
1850	int	mount_opt;
1851	int	flags;
1852} ext4_mount_opts[] = {
1853	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1854	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1855	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1856	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1857	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1858	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1859	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
1860	 MOPT_EXT4_ONLY | MOPT_SET},
1861	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
1862	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1863	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1864	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1865	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
1866	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1867	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1868	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1869	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
1870	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
1871	{Opt_commit, 0, MOPT_NO_EXT2},
1872	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1873	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1874	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1875	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1876	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1877				    EXT4_MOUNT_JOURNAL_CHECKSUM),
1878	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1879	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
1880	{Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
1881	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1882	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1883	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1884	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1885	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1886	{Opt_dax_type, 0, MOPT_EXT4_ONLY},
1887	{Opt_journal_dev, 0, MOPT_NO_EXT2},
1888	{Opt_journal_path, 0, MOPT_NO_EXT2},
1889	{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
1890	{Opt_data, 0, MOPT_NO_EXT2},
1891	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1892#ifdef CONFIG_EXT4_FS_POSIX_ACL
1893	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1894#else
1895	{Opt_acl, 0, MOPT_NOSUPPORT},
1896#endif
1897	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1898	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1899	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1900	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1901							MOPT_SET | MOPT_Q},
1902	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1903							MOPT_SET | MOPT_Q},
1904	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
1905							MOPT_SET | MOPT_Q},
1906	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1907		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
1908							MOPT_CLEAR | MOPT_Q},
1909	{Opt_usrjquota, 0, MOPT_Q},
1910	{Opt_grpjquota, 0, MOPT_Q},
1911	{Opt_jqfmt, 0, MOPT_QFMT},
1912	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
1913	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
1914	 MOPT_SET},
1915#ifdef CONFIG_EXT4_DEBUG
1916	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
1917	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
1918#endif
1919	{Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
1920	{Opt_err, 0, 0}
1921};
1922
1923#if IS_ENABLED(CONFIG_UNICODE)
1924static const struct ext4_sb_encodings {
1925	__u16 magic;
1926	char *name;
1927	unsigned int version;
1928} ext4_sb_encoding_map[] = {
1929	{EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
1930};
1931
1932static const struct ext4_sb_encodings *
1933ext4_sb_read_encoding(const struct ext4_super_block *es)
1934{
1935	__u16 magic = le16_to_cpu(es->s_encoding);
1936	int i;
1937
1938	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
1939		if (magic == ext4_sb_encoding_map[i].magic)
1940			return &ext4_sb_encoding_map[i];
1941
1942	return NULL;
1943}
1944#endif
1945
/*
 * Bits for ext4_fs_context::spec.  A bit is set by the option parser when
 * the corresponding value was explicitly given, so that only those values
 * are later copied out of the context.
 *
 * The lower-case names deliberately match the struct member they guard:
 * the APPLY() helper in ext4_apply_options() pastes the member name onto
 * "EXT4_SPEC_".  Bit 6 is not used.
 */

/* Options handled individually */
#define EXT4_SPEC_JQUOTA			(1 << 0)
#define EXT4_SPEC_JQFMT				(1 << 1)
#define EXT4_SPEC_DATAJ				(1 << 2)
#define EXT4_SPEC_SB_BLOCK			(1 << 3)
#define EXT4_SPEC_JOURNAL_DEV			(1 << 4)
#define EXT4_SPEC_JOURNAL_IOPRIO		(1 << 5)

/* Options named after the context/sbi member that stores them */
#define EXT4_SPEC_s_want_extra_isize		(1 << 7)
#define EXT4_SPEC_s_max_batch_time		(1 << 8)
#define EXT4_SPEC_s_min_batch_time		(1 << 9)
#define EXT4_SPEC_s_inode_readahead_blks	(1 << 10)
#define EXT4_SPEC_s_li_wait_mult		(1 << 11)
#define EXT4_SPEC_s_max_dir_size_kb		(1 << 12)
#define EXT4_SPEC_s_stripe			(1 << 13)
#define EXT4_SPEC_s_resuid			(1 << 14)
#define EXT4_SPEC_s_resgid			(1 << 15)
#define EXT4_SPEC_s_commit_interval		(1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay		(1 << 17)
#define EXT4_SPEC_s_sb_block			(1 << 18)
#define EXT4_SPEC_mb_optimize_scan		(1 << 19)
1965
1966struct ext4_fs_context {
1967	char		*s_qf_names[EXT4_MAXQUOTAS];
1968	struct fscrypt_dummy_policy dummy_enc_policy;
1969	int		s_jquota_fmt;	/* Format of quota to use */
1970#ifdef CONFIG_EXT4_DEBUG
1971	int s_fc_debug_max_replay;
1972#endif
1973	unsigned short	qname_spec;
1974	unsigned long	vals_s_flags;	/* Bits to set in s_flags */
1975	unsigned long	mask_s_flags;	/* Bits changed in s_flags */
1976	unsigned long	journal_devnum;
1977	unsigned long	s_commit_interval;
1978	unsigned long	s_stripe;
1979	unsigned int	s_inode_readahead_blks;
1980	unsigned int	s_want_extra_isize;
1981	unsigned int	s_li_wait_mult;
1982	unsigned int	s_max_dir_size_kb;
1983	unsigned int	journal_ioprio;
1984	unsigned int	vals_s_mount_opt;
1985	unsigned int	mask_s_mount_opt;
1986	unsigned int	vals_s_mount_opt2;
1987	unsigned int	mask_s_mount_opt2;
1988	unsigned int	opt_flags;	/* MOPT flags */
1989	unsigned int	spec;
1990	u32		s_max_batch_time;
1991	u32		s_min_batch_time;
1992	kuid_t		s_resuid;
1993	kgid_t		s_resgid;
1994	ext4_fsblk_t	s_sb_block;
1995};
1996
1997static void ext4_fc_free(struct fs_context *fc)
1998{
1999	struct ext4_fs_context *ctx = fc->fs_private;
2000	int i;
2001
2002	if (!ctx)
2003		return;
2004
2005	for (i = 0; i < EXT4_MAXQUOTAS; i++)
2006		kfree(ctx->s_qf_names[i]);
2007
2008	fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
2009	kfree(ctx);
2010}
2011
2012int ext4_init_fs_context(struct fs_context *fc)
2013{
2014	struct ext4_fs_context *ctx;
2015
2016	ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
2017	if (!ctx)
2018		return -ENOMEM;
2019
2020	fc->fs_private = ctx;
2021	fc->ops = &ext4_context_ops;
2022
2023	return 0;
2024}
2025
2026#ifdef CONFIG_QUOTA
2027/*
2028 * Note the name of the specified quota file.
2029 */
2030static int note_qf_name(struct fs_context *fc, int qtype,
2031		       struct fs_parameter *param)
2032{
2033	struct ext4_fs_context *ctx = fc->fs_private;
2034	char *qname;
2035
2036	if (param->size < 1) {
2037		ext4_msg(NULL, KERN_ERR, "Missing quota name");
2038		return -EINVAL;
2039	}
2040	if (strchr(param->string, '/')) {
2041		ext4_msg(NULL, KERN_ERR,
2042			 "quotafile must be on filesystem root");
2043		return -EINVAL;
2044	}
2045	if (ctx->s_qf_names[qtype]) {
2046		if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
2047			ext4_msg(NULL, KERN_ERR,
2048				 "%s quota file already specified",
2049				 QTYPE2NAME(qtype));
2050			return -EINVAL;
2051		}
2052		return 0;
2053	}
2054
2055	qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
2056	if (!qname) {
2057		ext4_msg(NULL, KERN_ERR,
2058			 "Not enough memory for storing quotafile name");
2059		return -ENOMEM;
2060	}
2061	ctx->s_qf_names[qtype] = qname;
2062	ctx->qname_spec |= 1 << qtype;
2063	ctx->spec |= EXT4_SPEC_JQUOTA;
2064	return 0;
2065}
2066
2067/*
2068 * Clear the name of the specified quota file.
2069 */
2070static int unnote_qf_name(struct fs_context *fc, int qtype)
2071{
2072	struct ext4_fs_context *ctx = fc->fs_private;
2073
2074	if (ctx->s_qf_names[qtype])
2075		kfree(ctx->s_qf_names[qtype]);
2076
2077	ctx->s_qf_names[qtype] = NULL;
2078	ctx->qname_spec |= 1 << qtype;
2079	ctx->spec |= EXT4_SPEC_JQUOTA;
2080	return 0;
2081}
2082#endif
2083
2084static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
2085					    struct ext4_fs_context *ctx)
2086{
2087	int err;
2088
2089	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
2090		ext4_msg(NULL, KERN_WARNING,
2091			 "test_dummy_encryption option not supported");
2092		return -EINVAL;
2093	}
2094	err = fscrypt_parse_test_dummy_encryption(param,
2095						  &ctx->dummy_enc_policy);
2096	if (err == -EINVAL) {
2097		ext4_msg(NULL, KERN_WARNING,
2098			 "Value of option \"%s\" is unrecognized", param->key);
2099	} else if (err == -EEXIST) {
2100		ext4_msg(NULL, KERN_WARNING,
2101			 "Conflicting test_dummy_encryption options");
2102		return -EINVAL;
2103	}
2104	return err;
2105}
2106
2107#define EXT4_SET_CTX(name)						\
2108static inline void ctx_set_##name(struct ext4_fs_context *ctx,		\
2109				  unsigned long flag)			\
2110{									\
2111	ctx->mask_s_##name |= flag;					\
2112	ctx->vals_s_##name |= flag;					\
2113}
2114
2115#define EXT4_CLEAR_CTX(name)						\
2116static inline void ctx_clear_##name(struct ext4_fs_context *ctx,	\
2117				    unsigned long flag)			\
2118{									\
2119	ctx->mask_s_##name |= flag;					\
2120	ctx->vals_s_##name &= ~flag;					\
2121}
2122
2123#define EXT4_TEST_CTX(name)						\
2124static inline unsigned long						\
2125ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
2126{									\
2127	return (ctx->vals_s_##name & flag);				\
2128}
2129
2130EXT4_SET_CTX(flags); /* set only */
2131EXT4_SET_CTX(mount_opt);
2132EXT4_CLEAR_CTX(mount_opt);
2133EXT4_TEST_CTX(mount_opt);
2134EXT4_SET_CTX(mount_opt2);
2135EXT4_CLEAR_CTX(mount_opt2);
2136EXT4_TEST_CTX(mount_opt2);
2137
2138static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
2139{
2140	struct ext4_fs_context *ctx = fc->fs_private;
2141	struct fs_parse_result result;
2142	const struct mount_opts *m;
2143	int is_remount;
2144	kuid_t uid;
2145	kgid_t gid;
2146	int token;
2147
2148	token = fs_parse(fc, ext4_param_specs, param, &result);
2149	if (token < 0)
2150		return token;
2151	is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
2152
2153	for (m = ext4_mount_opts; m->token != Opt_err; m++)
2154		if (token == m->token)
2155			break;
2156
2157	ctx->opt_flags |= m->flags;
2158
2159	if (m->flags & MOPT_EXPLICIT) {
2160		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
2161			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
2162		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
2163			ctx_set_mount_opt2(ctx,
2164				       EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
2165		} else
2166			return -EINVAL;
2167	}
2168
2169	if (m->flags & MOPT_NOSUPPORT) {
2170		ext4_msg(NULL, KERN_ERR, "%s option not supported",
2171			 param->key);
2172		return 0;
2173	}
2174
2175	switch (token) {
2176#ifdef CONFIG_QUOTA
2177	case Opt_usrjquota:
2178		if (!*param->string)
2179			return unnote_qf_name(fc, USRQUOTA);
2180		else
2181			return note_qf_name(fc, USRQUOTA, param);
2182	case Opt_grpjquota:
2183		if (!*param->string)
2184			return unnote_qf_name(fc, GRPQUOTA);
2185		else
2186			return note_qf_name(fc, GRPQUOTA, param);
2187#endif
2188	case Opt_sb:
2189		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
2190			ext4_msg(NULL, KERN_WARNING,
2191				 "Ignoring %s option on remount", param->key);
2192		} else {
2193			ctx->s_sb_block = result.uint_32;
2194			ctx->spec |= EXT4_SPEC_s_sb_block;
2195		}
2196		return 0;
2197	case Opt_removed:
2198		ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
2199			 param->key);
2200		return 0;
2201	case Opt_inlinecrypt:
2202#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
2203		ctx_set_flags(ctx, SB_INLINECRYPT);
2204#else
2205		ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
2206#endif
2207		return 0;
2208	case Opt_errors:
2209		ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
2210		ctx_set_mount_opt(ctx, result.uint_32);
2211		return 0;
2212#ifdef CONFIG_QUOTA
2213	case Opt_jqfmt:
2214		ctx->s_jquota_fmt = result.uint_32;
2215		ctx->spec |= EXT4_SPEC_JQFMT;
2216		return 0;
2217#endif
2218	case Opt_data:
2219		ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
2220		ctx_set_mount_opt(ctx, result.uint_32);
2221		ctx->spec |= EXT4_SPEC_DATAJ;
2222		return 0;
2223	case Opt_commit:
2224		if (result.uint_32 == 0)
2225			result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
2226		else if (result.uint_32 > INT_MAX / HZ) {
2227			ext4_msg(NULL, KERN_ERR,
2228				 "Invalid commit interval %d, "
2229				 "must be smaller than %d",
2230				 result.uint_32, INT_MAX / HZ);
2231			return -EINVAL;
2232		}
2233		ctx->s_commit_interval = HZ * result.uint_32;
2234		ctx->spec |= EXT4_SPEC_s_commit_interval;
2235		return 0;
2236	case Opt_debug_want_extra_isize:
2237		if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
2238			ext4_msg(NULL, KERN_ERR,
2239				 "Invalid want_extra_isize %d", result.uint_32);
2240			return -EINVAL;
2241		}
2242		ctx->s_want_extra_isize = result.uint_32;
2243		ctx->spec |= EXT4_SPEC_s_want_extra_isize;
2244		return 0;
2245	case Opt_max_batch_time:
2246		ctx->s_max_batch_time = result.uint_32;
2247		ctx->spec |= EXT4_SPEC_s_max_batch_time;
2248		return 0;
2249	case Opt_min_batch_time:
2250		ctx->s_min_batch_time = result.uint_32;
2251		ctx->spec |= EXT4_SPEC_s_min_batch_time;
2252		return 0;
2253	case Opt_inode_readahead_blks:
2254		if (result.uint_32 &&
2255		    (result.uint_32 > (1 << 30) ||
2256		     !is_power_of_2(result.uint_32))) {
2257			ext4_msg(NULL, KERN_ERR,
2258				 "EXT4-fs: inode_readahead_blks must be "
2259				 "0 or a power of 2 smaller than 2^31");
2260			return -EINVAL;
2261		}
2262		ctx->s_inode_readahead_blks = result.uint_32;
2263		ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
2264		return 0;
2265	case Opt_init_itable:
2266		ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
2267		ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
2268		if (param->type == fs_value_is_string)
2269			ctx->s_li_wait_mult = result.uint_32;
2270		ctx->spec |= EXT4_SPEC_s_li_wait_mult;
2271		return 0;
2272	case Opt_max_dir_size_kb:
2273		ctx->s_max_dir_size_kb = result.uint_32;
2274		ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
2275		return 0;
2276#ifdef CONFIG_EXT4_DEBUG
2277	case Opt_fc_debug_max_replay:
2278		ctx->s_fc_debug_max_replay = result.uint_32;
2279		ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
2280		return 0;
2281#endif
2282	case Opt_stripe:
2283		ctx->s_stripe = result.uint_32;
2284		ctx->spec |= EXT4_SPEC_s_stripe;
2285		return 0;
2286	case Opt_resuid:
2287		uid = make_kuid(current_user_ns(), result.uint_32);
2288		if (!uid_valid(uid)) {
2289			ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
2290				 result.uint_32);
2291			return -EINVAL;
2292		}
2293		ctx->s_resuid = uid;
2294		ctx->spec |= EXT4_SPEC_s_resuid;
2295		return 0;
2296	case Opt_resgid:
2297		gid = make_kgid(current_user_ns(), result.uint_32);
2298		if (!gid_valid(gid)) {
2299			ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
2300				 result.uint_32);
2301			return -EINVAL;
2302		}
2303		ctx->s_resgid = gid;
2304		ctx->spec |= EXT4_SPEC_s_resgid;
2305		return 0;
2306	case Opt_journal_dev:
2307		if (is_remount) {
2308			ext4_msg(NULL, KERN_ERR,
2309				 "Cannot specify journal on remount");
2310			return -EINVAL;
2311		}
2312		ctx->journal_devnum = result.uint_32;
2313		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
2314		return 0;
2315	case Opt_journal_path:
2316	{
2317		struct inode *journal_inode;
2318		struct path path;
2319		int error;
2320
2321		if (is_remount) {
2322			ext4_msg(NULL, KERN_ERR,
2323				 "Cannot specify journal on remount");
2324			return -EINVAL;
2325		}
2326
2327		error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
2328		if (error) {
2329			ext4_msg(NULL, KERN_ERR, "error: could not find "
2330				 "journal device path");
2331			return -EINVAL;
2332		}
2333
2334		journal_inode = d_inode(path.dentry);
2335		ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
2336		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
2337		path_put(&path);
2338		return 0;
2339	}
2340	case Opt_journal_ioprio:
2341		if (result.uint_32 > 7) {
2342			ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
2343				 " (must be 0-7)");
2344			return -EINVAL;
2345		}
2346		ctx->journal_ioprio =
2347			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
2348		ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
2349		return 0;
2350	case Opt_test_dummy_encryption:
2351		return ext4_parse_test_dummy_encryption(param, ctx);
2352	case Opt_dax:
2353	case Opt_dax_type:
2354#ifdef CONFIG_FS_DAX
2355	{
2356		int type = (token == Opt_dax) ?
2357			   Opt_dax : result.uint_32;
2358
2359		switch (type) {
2360		case Opt_dax:
2361		case Opt_dax_always:
2362			ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2363			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2364			break;
2365		case Opt_dax_never:
2366			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2367			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2368			break;
2369		case Opt_dax_inode:
2370			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2371			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2372			/* Strictly for printing options */
2373			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
2374			break;
2375		}
2376		return 0;
2377	}
2378#else
2379		ext4_msg(NULL, KERN_INFO, "dax option not supported");
2380		return -EINVAL;
2381#endif
2382	case Opt_data_err:
2383		if (result.uint_32 == Opt_data_err_abort)
2384			ctx_set_mount_opt(ctx, m->mount_opt);
2385		else if (result.uint_32 == Opt_data_err_ignore)
2386			ctx_clear_mount_opt(ctx, m->mount_opt);
2387		return 0;
2388	case Opt_mb_optimize_scan:
2389		if (result.int_32 == 1) {
2390			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
2391			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
2392		} else if (result.int_32 == 0) {
2393			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
2394			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
2395		} else {
2396			ext4_msg(NULL, KERN_WARNING,
2397				 "mb_optimize_scan should be set to 0 or 1.");
2398			return -EINVAL;
2399		}
2400		return 0;
2401	}
2402
2403	/*
2404	 * At this point we should only be getting options requiring MOPT_SET,
2405	 * or MOPT_CLEAR. Anything else is a bug
2406	 */
2407	if (m->token == Opt_err) {
2408		ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
2409			 param->key);
2410		WARN_ON(1);
2411		return -EINVAL;
2412	}
2413
2414	else {
2415		unsigned int set = 0;
2416
2417		if ((param->type == fs_value_is_flag) ||
2418		    result.uint_32 > 0)
2419			set = 1;
2420
2421		if (m->flags & MOPT_CLEAR)
2422			set = !set;
2423		else if (unlikely(!(m->flags & MOPT_SET))) {
2424			ext4_msg(NULL, KERN_WARNING,
2425				 "buggy handling of option %s",
2426				 param->key);
2427			WARN_ON(1);
2428			return -EINVAL;
2429		}
2430		if (m->flags & MOPT_2) {
2431			if (set != 0)
2432				ctx_set_mount_opt2(ctx, m->mount_opt);
2433			else
2434				ctx_clear_mount_opt2(ctx, m->mount_opt);
2435		} else {
2436			if (set != 0)
2437				ctx_set_mount_opt(ctx, m->mount_opt);
2438			else
2439				ctx_clear_mount_opt(ctx, m->mount_opt);
2440		}
2441	}
2442
2443	return 0;
2444}
2445
2446static int parse_options(struct fs_context *fc, char *options)
2447{
2448	struct fs_parameter param;
2449	int ret;
2450	char *key;
2451
2452	if (!options)
2453		return 0;
2454
2455	while ((key = strsep(&options, ",")) != NULL) {
2456		if (*key) {
2457			size_t v_len = 0;
2458			char *value = strchr(key, '=');
2459
2460			param.type = fs_value_is_flag;
2461			param.string = NULL;
2462
2463			if (value) {
2464				if (value == key)
2465					continue;
2466
2467				*value++ = 0;
2468				v_len = strlen(value);
2469				param.string = kmemdup_nul(value, v_len,
2470							   GFP_KERNEL);
2471				if (!param.string)
2472					return -ENOMEM;
2473				param.type = fs_value_is_string;
2474			}
2475
2476			param.key = key;
2477			param.size = v_len;
2478
2479			ret = ext4_parse_param(fc, &param);
2480			if (param.string)
2481				kfree(param.string);
2482			if (ret < 0)
2483				return ret;
2484		}
2485	}
2486
2487	ret = ext4_validate_options(fc);
2488	if (ret < 0)
2489		return ret;
2490
2491	return 0;
2492}
2493
2494static int parse_apply_sb_mount_options(struct super_block *sb,
2495					struct ext4_fs_context *m_ctx)
2496{
2497	struct ext4_sb_info *sbi = EXT4_SB(sb);
2498	char *s_mount_opts = NULL;
2499	struct ext4_fs_context *s_ctx = NULL;
2500	struct fs_context *fc = NULL;
2501	int ret = -ENOMEM;
2502
2503	if (!sbi->s_es->s_mount_opts[0])
2504		return 0;
2505
2506	s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
2507				sizeof(sbi->s_es->s_mount_opts),
2508				GFP_KERNEL);
2509	if (!s_mount_opts)
2510		return ret;
2511
2512	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
2513	if (!fc)
2514		goto out_free;
2515
2516	s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
2517	if (!s_ctx)
2518		goto out_free;
2519
2520	fc->fs_private = s_ctx;
2521	fc->s_fs_info = sbi;
2522
2523	ret = parse_options(fc, s_mount_opts);
2524	if (ret < 0)
2525		goto parse_failed;
2526
2527	ret = ext4_check_opt_consistency(fc, sb);
2528	if (ret < 0) {
2529parse_failed:
2530		ext4_msg(sb, KERN_WARNING,
2531			 "failed to parse options in superblock: %s",
2532			 s_mount_opts);
2533		ret = 0;
2534		goto out_free;
2535	}
2536
2537	if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
2538		m_ctx->journal_devnum = s_ctx->journal_devnum;
2539	if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
2540		m_ctx->journal_ioprio = s_ctx->journal_ioprio;
2541
2542	ext4_apply_options(fc, sb);
2543	ret = 0;
2544
2545out_free:
2546	if (fc) {
2547		ext4_fc_free(fc);
2548		kfree(fc);
2549	}
2550	kfree(s_mount_opts);
2551	return ret;
2552}
2553
/*
 * Copy the journaled quota settings from the context into the superblock
 * info.  Nothing is done when the filesystem has the quota feature.
 *
 * For each quota type marked in qname_spec the name pointer is moved from
 * the context into sbi->s_qf_names[] (a NULL name clears it) and the
 * previous name, if any, is freed after an RCU grace period.
 */
static void ext4_apply_quota_options(struct fs_context *fc,
				     struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int type;

	if (ext4_has_feature_quota(sb))
		return;

	if (ctx->spec & EXT4_SPEC_JQUOTA) {
		for (type = 0; type < EXT4_MAXQUOTAS; type++) {
			char *new_name, *old_name;

			if (!(ctx->qname_spec & (1 << type)))
				continue;

			/* May be NULL; the pointer now belongs to sbi */
			new_name = ctx->s_qf_names[type];
			ctx->s_qf_names[type] = NULL;
			if (new_name)
				set_opt(sb, QUOTA);

			old_name = rcu_replace_pointer(sbi->s_qf_names[type],
						new_name,
						lockdep_is_held(&sb->s_umount));
			if (old_name)
				kfree_rcu_mightsleep(old_name);
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT)
		sbi->s_jquota_fmt = ctx->s_jquota_fmt;
#endif
}
2587
/*
 * Check that the quota options in the context are consistent with each
 * other and with the current state of the superblock.
 *
 * Rejected with -EINVAL:
 *  - prjquota without the project feature,
 *  - changing quota options or journaled quota names/format while quota
 *    is loaded,
 *  - a journaled quota file name that differs from the one already set,
 *  - mixing journaled quota names with usrquota/grpquota,
 *  - journaled quota names without a quota format.
 *
 * Journaled quota options on a filesystem with the quota feature are only
 * reported and otherwise accepted.  Returns 0 when everything is fine, and
 * always 0 without CONFIG_QUOTA.
 */
static int ext4_check_quota_consistency(struct fs_context *fc,
					struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool quota_feature = ext4_has_feature_quota(sb);
	bool quota_loaded = sb_any_quota_loaded(sb);
	bool have_usr_name, have_grp_name;
	bool usrquota = false, grpquota = false;
	int quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
			  EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
	int type;

	/*
	 * Only project quota is tied to its feature flag here: 'usrquota'
	 * and 'grpquota' stay allowed without the quota feature so that
	 * legacy quota files keep working.
	 */
	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
	    !ext4_has_feature_project(sb)) {
		ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
			 "Cannot enable project quota enforcement.");
		return -EINVAL;
	}

	if (quota_loaded &&
	    ctx->mask_s_mount_opt & quota_flags &&
	    !ctx_test_mount_opt(ctx, quota_flags))
		goto err_quota_change;

	if (ctx->spec & EXT4_SPEC_JQUOTA) {
		for (type = 0; type < EXT4_MAXQUOTAS; type++) {
			if (!(ctx->qname_spec & (1 << type)))
				continue;

			/* With quota loaded a name may be neither added nor removed */
			if (quota_loaded &&
			    !!sbi->s_qf_names[type] != !!ctx->s_qf_names[type])
				goto err_jquota_change;

			if (sbi->s_qf_names[type] && ctx->s_qf_names[type] &&
			    strcmp(get_qf_name(sb, sbi, type),
				   ctx->s_qf_names[type]) != 0)
				goto err_jquota_specified;
		}

		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO,
				 "Journaled quota options ignored when "
				 "QUOTA feature is enabled");
			return 0;
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT) {
		if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
			goto err_jquota_change;
		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO, "Quota format mount options "
				 "ignored when QUOTA feature is enabled");
			return 0;
		}
	}

	/* Make sure we don't mix old and new quota format */
	have_usr_name = (get_qf_name(sb, sbi, USRQUOTA) ||
			 ctx->s_qf_names[USRQUOTA]);
	have_grp_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
			 ctx->s_qf_names[GRPQUOTA]);

	/* A journaled name overrides the plain option of the same type */
	if (have_usr_name)
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
	else
		usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
			    test_opt(sb, USRQUOTA));

	if (have_grp_name)
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
	else
		grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
			    test_opt(sb, GRPQUOTA));

	if (!have_usr_name && !have_grp_name)
		return 0;

	if (usrquota || grpquota) {
		ext4_msg(NULL, KERN_ERR, "old and new quota "
			 "format mixing");
		return -EINVAL;
	}

	if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
		ext4_msg(NULL, KERN_ERR, "journaled quota format "
			 "not specified");
		return -EINVAL;
	}

	return 0;

err_quota_change:
	ext4_msg(NULL, KERN_ERR,
		 "Cannot change quota options when quota turned on");
	return -EINVAL;
err_jquota_change:
	ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
		 "options when quota turned on");
	return -EINVAL;
err_jquota_specified:
	ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
		 QTYPE2NAME(type));
	return -EINVAL;
#else
	return 0;
#endif
}
2708
2709static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
2710					    struct super_block *sb)
2711{
2712	const struct ext4_fs_context *ctx = fc->fs_private;
2713	const struct ext4_sb_info *sbi = EXT4_SB(sb);
2714
2715	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
2716		return 0;
2717
2718	if (!ext4_has_feature_encrypt(sb)) {
2719		ext4_msg(NULL, KERN_WARNING,
2720			 "test_dummy_encryption requires encrypt feature");
2721		return -EINVAL;
2722	}
2723	/*
2724	 * This mount option is just for testing, and it's not worthwhile to
2725	 * implement the extra complexity (e.g. RCU protection) that would be
2726	 * needed to allow it to be set or changed during remount.  We do allow
2727	 * it to be specified during remount, but only if there is no change.
2728	 */
2729	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
2730		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2731						 &ctx->dummy_enc_policy))
2732			return 0;
2733		ext4_msg(NULL, KERN_WARNING,
2734			 "Can't set or change test_dummy_encryption on remount");
2735		return -EINVAL;
2736	}
2737	/* Also make sure s_mount_opts didn't contain a conflicting value. */
2738	if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
2739		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2740						 &ctx->dummy_enc_policy))
2741			return 0;
2742		ext4_msg(NULL, KERN_WARNING,
2743			 "Conflicting test_dummy_encryption options");
2744		return -EINVAL;
2745	}
2746	return 0;
2747}
2748
2749static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
2750					     struct super_block *sb)
2751{
2752	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
2753	    /* if already set, it was already verified to be the same */
2754	    fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
2755		return;
2756	EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
2757	memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
2758	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
2759}
2760
2761static int ext4_check_opt_consistency(struct fs_context *fc,
2762				      struct super_block *sb)
2763{
2764	struct ext4_fs_context *ctx = fc->fs_private;
2765	struct ext4_sb_info *sbi = fc->s_fs_info;
2766	int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
2767	int err;
2768
2769	if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
2770		ext4_msg(NULL, KERN_ERR,
2771			 "Mount option(s) incompatible with ext2");
2772		return -EINVAL;
2773	}
2774	if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
2775		ext4_msg(NULL, KERN_ERR,
2776			 "Mount option(s) incompatible with ext3");
2777		return -EINVAL;
2778	}
2779
2780	if (ctx->s_want_extra_isize >
2781	    (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
2782		ext4_msg(NULL, KERN_ERR,
2783			 "Invalid want_extra_isize %d",
2784			 ctx->s_want_extra_isize);
2785		return -EINVAL;
2786	}
2787
2788	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
2789		int blocksize =
2790			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2791		if (blocksize < PAGE_SIZE)
2792			ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
2793				 "experimental mount option 'dioread_nolock' "
2794				 "for blocksize < PAGE_SIZE");
2795	}
2796
2797	err = ext4_check_test_dummy_encryption(fc, sb);
2798	if (err)
2799		return err;
2800
2801	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
2802		if (!sbi->s_journal) {
2803			ext4_msg(NULL, KERN_WARNING,
2804				 "Remounting file system with no journal "
2805				 "so ignoring journalled data option");
2806			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
2807		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
2808			   test_opt(sb, DATA_FLAGS)) {
2809			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
2810				 "on remount");
2811			return -EINVAL;
2812		}
2813	}
2814
2815	if (is_remount) {
2816		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
2817		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2818			ext4_msg(NULL, KERN_ERR, "can't mount with "
2819				 "both data=journal and dax");
2820			return -EINVAL;
2821		}
2822
2823		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
2824		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2825		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
2826fail_dax_change_remount:
2827			ext4_msg(NULL, KERN_ERR, "can't change "
2828				 "dax mount option while remounting");
2829			return -EINVAL;
2830		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
2831			 (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2832			  (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
2833			goto fail_dax_change_remount;
2834		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
2835			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2836			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2837			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
2838			goto fail_dax_change_remount;
2839		}
2840	}
2841
2842	return ext4_check_quota_consistency(fc, sb);
2843}
2844
2845static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
2846{
2847	struct ext4_fs_context *ctx = fc->fs_private;
2848	struct ext4_sb_info *sbi = fc->s_fs_info;
2849
2850	sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
2851	sbi->s_mount_opt |= ctx->vals_s_mount_opt;
2852	sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
2853	sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
2854	sb->s_flags &= ~ctx->mask_s_flags;
2855	sb->s_flags |= ctx->vals_s_flags;
2856
2857#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
2858	APPLY(s_commit_interval);
2859	APPLY(s_stripe);
2860	APPLY(s_max_batch_time);
2861	APPLY(s_min_batch_time);
2862	APPLY(s_want_extra_isize);
2863	APPLY(s_inode_readahead_blks);
2864	APPLY(s_max_dir_size_kb);
2865	APPLY(s_li_wait_mult);
2866	APPLY(s_resgid);
2867	APPLY(s_resuid);
2868
2869#ifdef CONFIG_EXT4_DEBUG
2870	APPLY(s_fc_debug_max_replay);
2871#endif
2872
2873	ext4_apply_quota_options(fc, sb);
2874	ext4_apply_test_dummy_encryption(ctx, sb);
2875}
2876
2877
/*
 * Check the quota options collected in the context for old/new format
 * mixing.
 *
 * When a journaled quota file name was given for a type, a usrquota or
 * grpquota option set for that same type is dropped.  If one of the two
 * plain options is still set afterwards, the combination is rejected.
 *
 * Returns 1 when the options are fine and -EINVAL on format mixing.
 */
static int ext4_validate_options(struct fs_context *fc)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	const char *usr_name = ctx->s_qf_names[USRQUOTA];
	const char *grp_name = ctx->s_qf_names[GRPQUOTA];

	if (usr_name != NULL || grp_name != NULL) {
		if (usr_name != NULL &&
		    ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);

		if (grp_name != NULL &&
		    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);

		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
		    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
			ext4_msg(NULL, KERN_ERR, "old and new quota "
				 "format mixing");
			return -EINVAL;
		}
	}
#endif
	return 1;
}
2904
/*
 * Emit the journaled quota options of @sb into @seq: ",jqfmt=<name>" when
 * a quota format is set, followed by usrjquota/grpjquota for each quota
 * file name that is set.  The names are read under rcu_read_lock().
 * Emits nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *usr_name, *grp_name;

	if (sbi->s_jquota_fmt) {
		char *fmtname;

		switch (sbi->s_jquota_fmt) {
		case QFMT_VFS_OLD:
			fmtname = "vfsold";
			break;
		case QFMT_VFS_V0:
			fmtname = "vfsv0";
			break;
		case QFMT_VFS_V1:
			fmtname = "vfsv1";
			break;
		default:
			/* Unrecognised format: print an empty name */
			fmtname = "";
			break;
		}
		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	rcu_read_lock();
	usr_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	grp_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (usr_name != NULL)
		seq_show_option(seq, "usrjquota", usr_name);
	if (grp_name != NULL)
		seq_show_option(seq, "grpjquota", grp_name);
	rcu_read_unlock();
#endif
}
2939
2940static const char *token2str(int token)
2941{
2942	const struct fs_parameter_spec *spec;
2943
2944	for (spec = ext4_param_specs; spec->name != NULL; spec++)
2945		if (spec->opt == token && !spec->type)
2946			break;
2947	return spec->name;
2948}
2949
/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * @seq:    seq_file the options are written to
 * @sb:     superblock whose options are shown
 * @nodefs: when non-zero, options are separated by '\n' instead of ','
 *          and the "differs from default" filtering is bypassed for the
 *          options guarded by "nodefs ||" below, so they are always shown
 *
 * Always returns 0.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
			      int nodefs)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int def_errors;
	const struct mount_opts *m;
	char sep = nodefs ? '\n' : ',';

/* Both helpers emit the separator first, then the option text. */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

	if (sbi->s_sb_block != 1)
		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

	/* Walk the flag-style options described by the ext4_mount_opts table. */
	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
		int want_set = m->flags & MOPT_SET;
		int opt_2 = m->flags & MOPT_2;
		unsigned int mount_opt, def_mount_opt;

		/* Only plain set/clear entries not marked MOPT_SKIP are shown. */
		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
		    m->flags & MOPT_SKIP)
			continue;

		/* MOPT_2 entries live in the second option word. */
		if (opt_2) {
			mount_opt = sbi->s_mount_opt2;
			def_mount_opt = sbi->s_def_mount_opt2;
		} else {
			mount_opt = sbi->s_mount_opt;
			def_mount_opt = sbi->s_def_mount_opt;
		}
		/* skip if same as the default */
		if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
			continue;
		/* select Opt_noFoo vs Opt_Foo */
		if ((want_set &&
		     (mount_opt & m->mount_opt) != m->mount_opt) ||
		    (!want_set && (mount_opt & m->mount_opt)))
			continue;
		SEQ_OPTS_PRINT("%s", token2str(m->token));
	}

	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
		SEQ_OPTS_PRINT("resuid=%u",
				from_kuid_munged(&init_user_ns, sbi->s_resuid));
	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
		SEQ_OPTS_PRINT("resgid=%u",
				from_kgid_munged(&init_user_ns, sbi->s_resgid));
	/*
	 * With nodefs, -1 never equals any EXT4_ERRORS_* value compared below,
	 * so the active errors= behaviour is always printed.
	 */
	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
		SEQ_OPTS_PUTS("errors=remount-ro");
	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
		SEQ_OPTS_PUTS("errors=continue");
	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
		SEQ_OPTS_PUTS("errors=panic");
	/* s_commit_interval is kept in jiffies; it is printed in seconds. */
	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
	if (nodefs || sbi->s_stripe)
		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
	/* Show data= when the journalling mode bits differ from the default. */
	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
			(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
			SEQ_OPTS_PUTS("data=journal");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
			SEQ_OPTS_PUTS("data=ordered");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			SEQ_OPTS_PUTS("data=writeback");
	}
	if (nodefs ||
	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
			       sbi->s_inode_readahead_blks);

	/* init_itable is shown only while INIT_INODE_TABLE is enabled. */
	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
	if (nodefs || sbi->s_max_dir_size_kb)
		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
	if (test_opt(sb, DATA_ERR_ABORT))
		SEQ_OPTS_PUTS("data_err=abort");

	fscrypt_show_test_dummy_encryption(seq, sep, sb);

	if (sb->s_flags & SB_INLINECRYPT)
		SEQ_OPTS_PUTS("inlinecrypt");

	/* ext2-compat superblocks get the bare "dax" spelling. */
	if (test_opt(sb, DAX_ALWAYS)) {
		if (IS_EXT2_SB(sb))
			SEQ_OPTS_PUTS("dax");
		else
			SEQ_OPTS_PUTS("dax=always");
	} else if (test_opt2(sb, DAX_NEVER)) {
		SEQ_OPTS_PUTS("dax=never");
	} else if (test_opt2(sb, DAX_INODE)) {
		SEQ_OPTS_PUTS("dax=inode");
	}

	/*
	 * mb_optimize_scan is printed only when its state disagrees with what
	 * the group count alone (vs. MB_DEFAULT_LINEAR_SCAN_THRESHOLD) implies.
	 */
	if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
			!test_opt2(sb, MB_OPTIMIZE_SCAN)) {
		SEQ_OPTS_PUTS("mb_optimize_scan=0");
	} else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
			test_opt2(sb, MB_OPTIMIZE_SCAN)) {
		SEQ_OPTS_PUTS("mb_optimize_scan=1");
	}

	ext4_show_quota_options(seq, sb);
	return 0;
}
3069
3070static int ext4_show_options(struct seq_file *seq, struct dentry *root)
3071{
3072	return _ext4_show_options(seq, root->d_sb, 0);
3073}
3074
3075int ext4_seq_options_show(struct seq_file *seq, void *offset)
3076{
3077	struct super_block *sb = seq->private;
3078	int rc;
3079
3080	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
3081	rc = _ext4_show_options(seq, sb, 1);
3082	seq_puts(seq, "\n");
3083	return rc;
3084}
3085
3086static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
3087			    int read_only)
3088{
3089	struct ext4_sb_info *sbi = EXT4_SB(sb);
3090	int err = 0;
3091
3092	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
3093		ext4_msg(sb, KERN_ERR, "revision level too high, "
3094			 "forcing read-only mode");
3095		err = -EROFS;
3096		goto done;
3097	}
3098	if (read_only)
3099		goto done;
3100	if (!(sbi->s_mount_state & EXT4_VALID_FS))
3101		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
3102			 "running e2fsck is recommended");
3103	else if (sbi->s_mount_state & EXT4_ERROR_FS)
3104		ext4_msg(sb, KERN_WARNING,
3105			 "warning: mounting fs with errors, "
3106			 "running e2fsck is recommended");
3107	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
3108		 le16_to_cpu(es->s_mnt_count) >=
3109		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
3110		ext4_msg(sb, KERN_WARNING,
3111			 "warning: maximal mount count reached, "
3112			 "running e2fsck is recommended");
3113	else if (le32_to_cpu(es->s_checkinterval) &&
3114		 (ext4_get_tstamp(es, s_lastcheck) +
3115		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
3116		ext4_msg(sb, KERN_WARNING,
3117			 "warning: checktime reached, "
3118			 "running e2fsck is recommended");
3119	if (!sbi->s_journal)
3120		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
3121	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
3122		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
3123	le16_add_cpu(&es->s_mnt_count, 1);
3124	ext4_update_tstamp(es, s_mtime);
3125	if (sbi->s_journal) {
3126		ext4_set_feature_journal_needs_recovery(sb);
3127		if (ext4_has_feature_orphan_file(sb))
3128			ext4_set_feature_orphan_present(sb);
3129	}
3130
3131	err = ext4_commit_super(sb);
3132done:
3133	if (test_opt(sb, DEBUG))
3134		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
3135				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
3136			sb->s_blocksize,
3137			sbi->s_groups_count,
3138			EXT4_BLOCKS_PER_GROUP(sb),
3139			EXT4_INODES_PER_GROUP(sb),
3140			sbi->s_mount_opt, sbi->s_mount_opt2);
3141	return err;
3142}
3143
3144int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
3145{
3146	struct ext4_sb_info *sbi = EXT4_SB(sb);
3147	struct flex_groups **old_groups, **new_groups;
3148	int size, i, j;
3149
3150	if (!sbi->s_log_groups_per_flex)
3151		return 0;
3152
3153	size = ext4_flex_group(sbi, ngroup - 1) + 1;
3154	if (size <= sbi->s_flex_groups_allocated)
3155		return 0;
3156
3157	new_groups = kvzalloc(roundup_pow_of_two(size *
3158			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
3159	if (!new_groups) {
3160		ext4_msg(sb, KERN_ERR,
3161			 "not enough memory for %d flex group pointers", size);
3162		return -ENOMEM;
3163	}
3164	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
3165		new_groups[i] = kvzalloc(roundup_pow_of_two(
3166					 sizeof(struct flex_groups)),
3167					 GFP_KERNEL);
3168		if (!new_groups[i]) {
3169			for (j = sbi->s_flex_groups_allocated; j < i; j++)
3170				kvfree(new_groups[j]);
3171			kvfree(new_groups);
3172			ext4_msg(sb, KERN_ERR,
3173				 "not enough memory for %d flex groups", size);
3174			return -ENOMEM;
3175		}
3176	}
3177	rcu_read_lock();
3178	old_groups = rcu_dereference(sbi->s_flex_groups);
3179	if (old_groups)
3180		memcpy(new_groups, old_groups,
3181		       (sbi->s_flex_groups_allocated *
3182			sizeof(struct flex_groups *)));
3183	rcu_read_unlock();
3184	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
3185	sbi->s_flex_groups_allocated = size;
3186	if (old_groups)
3187		ext4_kvfree_array_rcu(old_groups);
3188	return 0;
3189}
3190
3191static int ext4_fill_flex_info(struct super_block *sb)
3192{
3193	struct ext4_sb_info *sbi = EXT4_SB(sb);
3194	struct ext4_group_desc *gdp = NULL;
3195	struct flex_groups *fg;
3196	ext4_group_t flex_group;
3197	int i, err;
3198
3199	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
3200	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
3201		sbi->s_log_groups_per_flex = 0;
3202		return 1;
3203	}
3204
3205	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
3206	if (err)
3207		goto failed;
3208
3209	for (i = 0; i < sbi->s_groups_count; i++) {
3210		gdp = ext4_get_group_desc(sb, i, NULL);
3211
3212		flex_group = ext4_flex_group(sbi, i);
3213		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
3214		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
3215		atomic64_add(ext4_free_group_clusters(sb, gdp),
3216			     &fg->free_clusters);
3217		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
3218	}
3219
3220	return 1;
3221failed:
3222	return 0;
3223}
3224
3225static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
3226				   struct ext4_group_desc *gdp)
3227{
3228	int offset = offsetof(struct ext4_group_desc, bg_checksum);
3229	__u16 crc = 0;
3230	__le32 le_group = cpu_to_le32(block_group);
3231	struct ext4_sb_info *sbi = EXT4_SB(sb);
3232
3233	if (ext4_has_metadata_csum(sbi->s_sb)) {
3234		/* Use new metadata_csum algorithm */
3235		__u32 csum32;
3236		__u16 dummy_csum = 0;
3237
3238		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
3239				     sizeof(le_group));
3240		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
3241		csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
3242				     sizeof(dummy_csum));
3243		offset += sizeof(dummy_csum);
3244		if (offset < sbi->s_desc_size)
3245			csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
3246					     sbi->s_desc_size - offset);
3247
3248		crc = csum32 & 0xFFFF;
3249		goto out;
3250	}
3251
3252	/* old crc16 code */
3253	if (!ext4_has_feature_gdt_csum(sb))
3254		return 0;
3255
3256	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
3257	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
3258	crc = crc16(crc, (__u8 *)gdp, offset);
3259	offset += sizeof(gdp->bg_checksum); /* skip checksum */
3260	/* for checksum of struct ext4_group_desc do the rest...*/
3261	if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
3262		crc = crc16(crc, (__u8 *)gdp + offset,
3263			    sbi->s_desc_size - offset);
3264
3265out:
3266	return cpu_to_le16(crc);
3267}
3268
3269int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
3270				struct ext4_group_desc *gdp)
3271{
3272	if (ext4_has_group_desc_csum(sb) &&
3273	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
3274		return 0;
3275
3276	return 1;
3277}
3278
3279void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
3280			      struct ext4_group_desc *gdp)
3281{
3282	if (!ext4_has_group_desc_csum(sb))
3283		return;
3284	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
3285}
3286
/*
 * Called at mount-time, super-block is locked
 *
 * Sanity-check every group descriptor: the block bitmap, inode bitmap and
 * inode table must not overlap the superblock (@sb_block) or the group
 * descriptor blocks that follow it, must lie inside the allowed block
 * range, and the descriptor checksum must verify.
 *
 * Overlap and checksum problems are always logged but only fail the check
 * when the filesystem is not read-only; out-of-range locations always fail.
 *
 * If @first_not_zeroed is non-NULL it receives the index of the first
 * group without EXT4_BG_INODE_ZEROED, or s_groups_count if there is none.
 *
 * Returns 1 when the descriptors are acceptable, 0 otherwise.
 */
static int ext4_check_descriptors(struct super_block *sb,
				  ext4_fsblk_t sb_block,
				  ext4_group_t *first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
	ext4_fsblk_t last_block;
	/* Last block of the descriptor area that follows the superblock. */
	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
	ext4_fsblk_t block_bitmap;
	ext4_fsblk_t inode_bitmap;
	ext4_fsblk_t inode_table;
	int flexbg_flag = 0;
	/* grp == s_groups_count means "no un-zeroed group found yet". */
	ext4_group_t i, grp = sbi->s_groups_count;

	if (ext4_has_feature_flex_bg(sb))
		flexbg_flag = 1;

	ext4_debug("Checking group descriptors");

	for (i = 0; i < sbi->s_groups_count; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

		/*
		 * With flex_bg (and for the last group) the upper bound is the
		 * end of the filesystem; otherwise it is the end of this group.
		 */
		if (i == sbi->s_groups_count - 1 || flexbg_flag)
			last_block = ext4_blocks_count(sbi->s_es) - 1;
		else
			last_block = first_block +
				(EXT4_BLOCKS_PER_GROUP(sb) - 1);

		/* Remember the first group whose inode table is not zeroed. */
		if ((grp == sbi->s_groups_count) &&
		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			grp = i;

		block_bitmap = ext4_block_bitmap(sb, gdp);
		if (block_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap >= sb_block + 1 &&
		    block_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap < first_block || block_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Block bitmap for group %u not in group "
			       "(block %llu)!", i, block_bitmap);
			return 0;
		}
		inode_bitmap = ext4_inode_bitmap(sb, gdp);
		if (inode_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap >= sb_block + 1 &&
		    inode_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap < first_block || inode_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode bitmap for group %u not in group "
			       "(block %llu)!", i, inode_bitmap);
			return 0;
		}
		inode_table = ext4_inode_table(sb, gdp);
		if (inode_table == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_table >= sb_block + 1 &&
		    inode_table <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		/* The whole table (s_itb_per_group blocks) must fit the range. */
		if (inode_table < first_block ||
		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode table for group %u not in group "
			       "(block %llu)!", i, inode_table);
			return 0;
		}
		/* The checksum is verified with the group lock held. */
		ext4_lock_group(sb, i);
		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Checksum for group %u failed (%u!=%u)",
				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
				     gdp)), le16_to_cpu(gdp->bg_checksum));
			if (!sb_rdonly(sb)) {
				ext4_unlock_group(sb, i);
				return 0;
			}
		}
		ext4_unlock_group(sb, i);
		/* Without flex_bg the lower bound advances group by group. */
		if (!flexbg_flag)
			first_block += EXT4_BLOCKS_PER_GROUP(sb);
	}
	if (NULL != first_not_zeroed)
		*first_not_zeroed = grp;
	return 1;
}
3406
3407/*
3408 * Maximal extent format file size.
3409 * Resulting logical blkno at s_maxbytes must fit in our on-disk
3410 * extent format containers, within a sector_t, and within i_blocks
3411 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
3412 * so that won't be a limiting factor.
3413 *
3414 * However there is other limiting factor. We do store extents in the form
3415 * of starting block and length, hence the resulting length of the extent
3416 * covering maximum file size must fit into on-disk format containers as
3417 * well. Given that length is always by 1 unit bigger than max unit (because
3418 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
3419 *
3420 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
3421 */
3422static loff_t ext4_max_size(int blkbits, int has_huge_files)
3423{
3424	loff_t res;
3425	loff_t upper_limit = MAX_LFS_FILESIZE;
3426
3427	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
3428
3429	if (!has_huge_files) {
3430		upper_limit = (1LL << 32) - 1;
3431
3432		/* total blocks in file system block size */
3433		upper_limit >>= (blkbits - 9);
3434		upper_limit <<= blkbits;
3435	}
3436
3437	/*
3438	 * 32-bit extent-start container, ee_block. We lower the maxbytes
3439	 * by one fs block, so ee_len can cover the extent of maximum file
3440	 * size
3441	 */
3442	res = (1LL << 32) - 1;
3443	res <<= blkbits;
3444
3445	/* Sanity check against vm- & vfs- imposed limits */
3446	if (res > upper_limit)
3447		res = upper_limit;
3448
3449	return res;
3450}
3451
/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 *
 * @bits:           log2 of the filesystem block size (the result is
 *                  shifted left by @bits to convert blocks to bytes)
 * @has_huge_files: non-zero when the 48-bit i_blocks limit applies
 *
 * Returns the maximum size in bytes, capped at MAX_LFS_FILESIZE.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
	loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
	int meta_blocks;
	/* Block pointers per block: block size / 4. */
	unsigned int ppb = 1 << (bits - 2);

	/*
	 * This is calculated to be the largest file size for a dense, block
	 * mapped file such that the file's total number of 512-byte sectors,
	 * including data and all indirect blocks, does not exceed (2^48 - 1).
	 *
	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
	 * number of 512-byte sectors of the file.
	 */
	if (!has_huge_files) {
		/*
		 * Without huge file support the inode i_blocks field holds
		 * the total file size in 512-byte sectors and is limited to
		 * 32 bits (2^32 - 1 sectors).
		 */
		upper_limit = (1LL << 32) - 1;

		/* total blocks in file system block size */
		upper_limit >>= (bits - 9);

	} else {
		/*
		 * We use 48 bit ext4_inode i_blocks
		 * With EXT4_HUGE_FILE_FL set the i_blocks
		 * represent total number of blocks in
		 * file system block size
		 */
		upper_limit = (1LL << 48) - 1;

	}

	/* Compute how many blocks we can address by block tree */
	res += ppb;
	res += ppb * ppb;
	res += ((loff_t)ppb) * ppb * ppb;
	/* Compute how many metadata blocks are needed */
	meta_blocks = 1;
	meta_blocks += 1 + ppb;
	meta_blocks += 1 + ppb + ppb * ppb;
	/* Does block tree limit file size? */
	if (res + meta_blocks <= upper_limit)
		goto check_lfs;

	/*
	 * The i_blocks limit is the tighter one: start from it and subtract
	 * the indirect blocks needed to map that many data blocks.
	 */
	res = upper_limit;
	/* How many metadata blocks are needed for addressing upper_limit? */
	upper_limit -= EXT4_NDIR_BLOCKS;
	/* indirect blocks */
	meta_blocks = 1;
	upper_limit -= ppb;
	/* double indirect blocks */
	if (upper_limit < ppb * ppb) {
		meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
		res -= meta_blocks;
		goto check_lfs;
	}
	meta_blocks += 1 + ppb;
	upper_limit -= ppb * ppb;
	/* triple indirect blocks for the rest */
	meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
		DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
	res -= meta_blocks;
check_lfs:
	/* Convert from blocks to bytes and clamp to the VFS maximum. */
	res <<= bits;
	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	return res;
}
3530
3531static ext4_fsblk_t descriptor_loc(struct super_block *sb,
3532				   ext4_fsblk_t logical_sb_block, int nr)
3533{
3534	struct ext4_sb_info *sbi = EXT4_SB(sb);
3535	ext4_group_t bg, first_meta_bg;
3536	int has_super = 0;
3537
3538	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
3539
3540	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
3541		return logical_sb_block + nr + 1;
3542	bg = sbi->s_desc_per_block * nr;
3543	if (ext4_bg_has_super(sb, bg))
3544		has_super = 1;
3545
3546	/*
3547	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
3548	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
3549	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
3550	 * compensate.
3551	 */
3552	if (sb->s_blocksize == 1024 && nr == 0 &&
3553	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
3554		has_super++;
3555
3556	return (has_super + ext4_group_first_block_no(sb, bg));
3557}
3558
3559/**
3560 * ext4_get_stripe_size: Get the stripe size.
3561 * @sbi: In memory super block info
3562 *
3563 * If we have specified it via mount option, then
3564 * use the mount option value. If the value specified at mount time is
3565 * greater than the blocks per group use the super block value.
3566 * If the super block value is greater than blocks per group return 0.
3567 * Allocator needs it be less than blocks per group.
3568 *
3569 */
3570static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3571{
3572	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3573	unsigned long stripe_width =
3574			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3575	int ret;
3576
3577	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3578		ret = sbi->s_stripe;
3579	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3580		ret = stripe_width;
3581	else if (stride && stride <= sbi->s_blocks_per_group)
3582		ret = stride;
3583	else
3584		ret = 0;
3585
3586	/*
3587	 * If the stripe width is 1, this makes no sense and
3588	 * we set it to 0 to turn off stripe handling code.
3589	 */
3590	if (ret <= 1)
3591		ret = 0;
3592
3593	return ret;
3594}
3595
3596/*
3597 * Check whether this filesystem can be mounted based on
3598 * the features present and the RDONLY/RDWR mount requested.
3599 * Returns 1 if this filesystem can be mounted as requested,
3600 * 0 if it cannot be.
3601 */
3602int ext4_feature_set_ok(struct super_block *sb, int readonly)
3603{
3604	if (ext4_has_unknown_ext4_incompat_features(sb)) {
3605		ext4_msg(sb, KERN_ERR,
3606			"Couldn't mount because of "
3607			"unsupported optional features (%x)",
3608			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3609			~EXT4_FEATURE_INCOMPAT_SUPP));
3610		return 0;
3611	}
3612
3613#if !IS_ENABLED(CONFIG_UNICODE)
3614	if (ext4_has_feature_casefold(sb)) {
3615		ext4_msg(sb, KERN_ERR,
3616			 "Filesystem with casefold feature cannot be "
3617			 "mounted without CONFIG_UNICODE");
3618		return 0;
3619	}
3620#endif
3621
3622	if (readonly)
3623		return 1;
3624
3625	if (ext4_has_feature_readonly(sb)) {
3626		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3627		sb->s_flags |= SB_RDONLY;
3628		return 1;
3629	}
3630
3631	/* Check that feature set is OK for a read-write mount */
3632	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3633		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3634			 "unsupported optional features (%x)",
3635			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3636				~EXT4_FEATURE_RO_COMPAT_SUPP));
3637		return 0;
3638	}
3639	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3640		ext4_msg(sb, KERN_ERR,
3641			 "Can't support bigalloc feature without "
3642			 "extents feature\n");
3643		return 0;
3644	}
3645
3646#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3647	if (!readonly && (ext4_has_feature_quota(sb) ||
3648			  ext4_has_feature_project(sb))) {
3649		ext4_msg(sb, KERN_ERR,
3650			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3651		return 0;
3652	}
3653#endif  /* CONFIG_QUOTA */
3654	return 1;
3655}
3656
/*
 * This function is called once a day if we have errors logged
 * on the file system
 *
 * Timer callback for sbi->s_err_report: logs the error count and the
 * first/last error records stored in the superblock, then re-arms the
 * timer to fire again in 24 hours.
 */
static void print_daily_error_info(struct timer_list *t)
{
	struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
	struct super_block *sb = sbi->s_sb;
	struct ext4_super_block *es = sbi->s_es;

	if (es->s_error_count)
		/* fsck newer than v1.41.13 is needed to clean this condition. */
		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
			 le32_to_cpu(es->s_error_count));
	if (es->s_first_error_time) {
		/*
		 * The function name is printed with an explicit length bound
		 * (%.*s) - presumably because the on-disk field need not be
		 * NUL-terminated.  Inode and block are appended with KERN_CONT
		 * only when they were recorded.
		 */
		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
		       sb->s_id,
		       ext4_get_tstamp(es, s_first_error_time),
		       (int) sizeof(es->s_first_error_func),
		       es->s_first_error_func,
		       le32_to_cpu(es->s_first_error_line));
		if (es->s_first_error_ino)
			printk(KERN_CONT ": inode %u",
			       le32_to_cpu(es->s_first_error_ino));
		if (es->s_first_error_block)
			printk(KERN_CONT ": block %llu", (unsigned long long)
			       le64_to_cpu(es->s_first_error_block));
		printk(KERN_CONT "\n");
	}
	/* Same layout for the most recent error. */
	if (es->s_last_error_time) {
		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
		       sb->s_id,
		       ext4_get_tstamp(es, s_last_error_time),
		       (int) sizeof(es->s_last_error_func),
		       es->s_last_error_func,
		       le32_to_cpu(es->s_last_error_line));
		if (es->s_last_error_ino)
			printk(KERN_CONT ": inode %u",
			       le32_to_cpu(es->s_last_error_ino));
		if (es->s_last_error_block)
			printk(KERN_CONT ": block %llu", (unsigned long long)
			       le64_to_cpu(es->s_last_error_block));
		printk(KERN_CONT "\n");
	}
	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
}
3703
3704/* Find next suitable group and run ext4_init_inode_table */
3705static int ext4_run_li_request(struct ext4_li_request *elr)
3706{
3707	struct ext4_group_desc *gdp = NULL;
3708	struct super_block *sb = elr->lr_super;
3709	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3710	ext4_group_t group = elr->lr_next_group;
3711	unsigned int prefetch_ios = 0;
3712	int ret = 0;
3713	int nr = EXT4_SB(sb)->s_mb_prefetch;
3714	u64 start_time;
3715
3716	if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
3717		elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
3718		ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
3719		trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
3720		if (group >= elr->lr_next_group) {
3721			ret = 1;
3722			if (elr->lr_first_not_zeroed != ngroups &&
3723			    !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
3724				elr->lr_next_group = elr->lr_first_not_zeroed;
3725				elr->lr_mode = EXT4_LI_MODE_ITABLE;
3726				ret = 0;
3727			}
3728		}
3729		return ret;
3730	}
3731
3732	for (; group < ngroups; group++) {
3733		gdp = ext4_get_group_desc(sb, group, NULL);
3734		if (!gdp) {
3735			ret = 1;
3736			break;
3737		}
3738
3739		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3740			break;
3741	}
3742
3743	if (group >= ngroups)
3744		ret = 1;
3745
3746	if (!ret) {
3747		start_time = ktime_get_real_ns();
3748		ret = ext4_init_inode_table(sb, group,
3749					    elr->lr_timeout ? 0 : 1);
3750		trace_ext4_lazy_itable_init(sb, group);
3751		if (elr->lr_timeout == 0) {
3752			elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
3753				EXT4_SB(elr->lr_super)->s_li_wait_mult);
3754		}
3755		elr->lr_next_sched = jiffies + elr->lr_timeout;
3756		elr->lr_next_group = group + 1;
3757	}
3758	return ret;
3759}
3760
3761/*
3762 * Remove lr_request from the list_request and free the
3763 * request structure. Should be called with li_list_mtx held
3764 */
3765static void ext4_remove_li_request(struct ext4_li_request *elr)
3766{
3767	if (!elr)
3768		return;
3769
3770	list_del(&elr->lr_request);
3771	EXT4_SB(elr->lr_super)->s_li_request = NULL;
3772	kfree(elr);
3773}
3774
3775static void ext4_unregister_li_request(struct super_block *sb)
3776{
3777	mutex_lock(&ext4_li_mtx);
3778	if (!ext4_li_info) {
3779		mutex_unlock(&ext4_li_mtx);
3780		return;
3781	}
3782
3783	mutex_lock(&ext4_li_info->li_list_mtx);
3784	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3785	mutex_unlock(&ext4_li_info->li_list_mtx);
3786	mutex_unlock(&ext4_li_mtx);
3787}
3788
/* Task pointer of the lazy-init kthread (set by ext4_run_lazyinit_thread()). */
static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where ext4lazyinit thread lives. It walks
 * through the request list searching for next scheduled filesystem.
 * When such a fs is found, run the lazy initialization request
 * (ext4_run_li_request) and keep track of the time spent in this
 * function. Based on that time we compute next schedule time of
 * the request. When walking through the list is complete, compute
 * next waking time and put itself into sleep.
 *
 * @arg is the struct ext4_lazy_init whose request list is served.
 * The thread exits (returning 0) once the list is found empty, freeing
 * ext4_li_info on the way out.
 */
static int ext4_lazyinit_thread(void *arg)
{
	struct ext4_lazy_init *eli = arg;
	struct list_head *pos, *n;
	struct ext4_li_request *elr;
	unsigned long next_wakeup, cur;

	BUG_ON(NULL == eli);
	set_freezable();

cont_thread:
	while (true) {
		next_wakeup = MAX_JIFFY_OFFSET;

		mutex_lock(&eli->li_list_mtx);
		if (list_empty(&eli->li_request_list)) {
			mutex_unlock(&eli->li_list_mtx);
			goto exit_thread;
		}
		list_for_each_safe(pos, n, &eli->li_request_list) {
			int err = 0;
			int progress = 0;
			elr = list_entry(pos, struct ext4_li_request,
					 lr_request);

			/* Not due yet: only account for it in next_wakeup. */
			if (time_before(jiffies, elr->lr_next_sched)) {
				if (time_before(elr->lr_next_sched, next_wakeup))
					next_wakeup = elr->lr_next_sched;
				continue;
			}
			/*
			 * Both locks are only tried; if either is unavailable
			 * the request is retried later (see !progress below).
			 */
			if (down_read_trylock(&elr->lr_super->s_umount)) {
				if (sb_start_write_trylock(elr->lr_super)) {
					progress = 1;
					/*
					 * We hold sb->s_umount, sb can not
					 * be removed from the list, it is
					 * now safe to drop li_list_mtx
					 */
					mutex_unlock(&eli->li_list_mtx);
					err = ext4_run_li_request(elr);
					sb_end_write(elr->lr_super);
					mutex_lock(&eli->li_list_mtx);
					/*
					 * The list may have changed while the
					 * mutex was dropped; reload the next
					 * entry.
					 */
					n = pos->next;
				}
				up_read((&elr->lr_super->s_umount));
			}
			/*
			 * Non-zero result (error, or nothing left to do for
			 * this filesystem): remove the lazy_init job.
			 */
			if (err) {
				ext4_remove_li_request(elr);
				continue;
			}
			/* Could not take the locks: retry after a random delay. */
			if (!progress) {
				elr->lr_next_sched = jiffies +
					get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
			}
			if (time_before(elr->lr_next_sched, next_wakeup))
				next_wakeup = elr->lr_next_sched;
		}
		mutex_unlock(&eli->li_list_mtx);

		try_to_freeze();

		/*
		 * Something is already due, or no wakeup time was recorded:
		 * yield and rescan immediately instead of sleeping.
		 */
		cur = jiffies;
		if ((time_after_eq(cur, next_wakeup)) ||
		    (MAX_JIFFY_OFFSET == next_wakeup)) {
			cond_resched();
			continue;
		}

		schedule_timeout_interruptible(next_wakeup - cur);

		/* Asked to stop: drop all pending requests and exit. */
		if (kthread_should_stop()) {
			ext4_clear_request_list();
			goto exit_thread;
		}
	}

exit_thread:
	/*
	 * It looks like the request list is empty, but we need
	 * to check it under the li_list_mtx lock, to prevent any
	 * additions into it, and of course we should lock ext4_li_mtx
	 * to atomically free the list and ext4_li_info, because at
	 * this point another ext4 filesystem could be registering
	 * new one.
	 */
	mutex_lock(&ext4_li_mtx);
	mutex_lock(&eli->li_list_mtx);
	if (!list_empty(&eli->li_request_list)) {
		/* A request slipped in meanwhile: resume the main loop. */
		mutex_unlock(&eli->li_list_mtx);
		mutex_unlock(&ext4_li_mtx);
		goto cont_thread;
	}
	mutex_unlock(&eli->li_list_mtx);
	kfree(ext4_li_info);
	ext4_li_info = NULL;
	mutex_unlock(&ext4_li_mtx);

	return 0;
}
3900
3901static void ext4_clear_request_list(void)
3902{
3903	struct list_head *pos, *n;
3904	struct ext4_li_request *elr;
3905
3906	mutex_lock(&ext4_li_info->li_list_mtx);
3907	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3908		elr = list_entry(pos, struct ext4_li_request,
3909				 lr_request);
3910		ext4_remove_li_request(elr);
3911	}
3912	mutex_unlock(&ext4_li_info->li_list_mtx);
3913}
3914
3915static int ext4_run_lazyinit_thread(void)
3916{
3917	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3918					 ext4_li_info, "ext4lazyinit");
3919	if (IS_ERR(ext4_lazyinit_task)) {
3920		int err = PTR_ERR(ext4_lazyinit_task);
3921		ext4_clear_request_list();
3922		kfree(ext4_li_info);
3923		ext4_li_info = NULL;
3924		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3925				 "initialization thread\n",
3926				 err);
3927		return err;
3928	}
3929	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3930	return 0;
3931}
3932
3933/*
3934 * Check whether it make sense to run itable init. thread or not.
3935 * If there is at least one uninitialized inode table, return
3936 * corresponding group number, else the loop goes through all
3937 * groups and return total number of groups.
3938 */
3939static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3940{
3941	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3942	struct ext4_group_desc *gdp = NULL;
3943
3944	if (!ext4_has_group_desc_csum(sb))
3945		return ngroups;
3946
3947	for (group = 0; group < ngroups; group++) {
3948		gdp = ext4_get_group_desc(sb, group, NULL);
3949		if (!gdp)
3950			continue;
3951
3952		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3953			break;
3954	}
3955
3956	return group;
3957}
3958
3959static int ext4_li_info_new(void)
3960{
3961	struct ext4_lazy_init *eli = NULL;
3962
3963	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3964	if (!eli)
3965		return -ENOMEM;
3966
3967	INIT_LIST_HEAD(&eli->li_request_list);
3968	mutex_init(&eli->li_list_mtx);
3969
3970	eli->li_state |= EXT4_LAZYINIT_QUIT;
3971
3972	ext4_li_info = eli;
3973
3974	return 0;
3975}
3976
3977static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3978					    ext4_group_t start)
3979{
3980	struct ext4_li_request *elr;
3981
3982	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3983	if (!elr)
3984		return NULL;
3985
3986	elr->lr_super = sb;
3987	elr->lr_first_not_zeroed = start;
3988	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
3989		elr->lr_mode = EXT4_LI_MODE_ITABLE;
3990		elr->lr_next_group = start;
3991	} else {
3992		elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
3993	}
3994
3995	/*
3996	 * Randomize first schedule time of the request to
3997	 * spread the inode table initialization requests
3998	 * better.
3999	 */
4000	elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
4001	return elr;
4002}
4003
4004int ext4_register_li_request(struct super_block *sb,
4005			     ext4_group_t first_not_zeroed)
4006{
4007	struct ext4_sb_info *sbi = EXT4_SB(sb);
4008	struct ext4_li_request *elr = NULL;
4009	ext4_group_t ngroups = sbi->s_groups_count;
4010	int ret = 0;
4011
4012	mutex_lock(&ext4_li_mtx);
4013	if (sbi->s_li_request != NULL) {
4014		/*
4015		 * Reset timeout so it can be computed again, because
4016		 * s_li_wait_mult might have changed.
4017		 */
4018		sbi->s_li_request->lr_timeout = 0;
4019		goto out;
4020	}
4021
4022	if (sb_rdonly(sb) ||
4023	    (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
4024	     (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
4025		goto out;
4026
4027	elr = ext4_li_request_new(sb, first_not_zeroed);
4028	if (!elr) {
4029		ret = -ENOMEM;
4030		goto out;
4031	}
4032
4033	if (NULL == ext4_li_info) {
4034		ret = ext4_li_info_new();
4035		if (ret)
4036			goto out;
4037	}
4038
4039	mutex_lock(&ext4_li_info->li_list_mtx);
4040	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
4041	mutex_unlock(&ext4_li_info->li_list_mtx);
4042
4043	sbi->s_li_request = elr;
4044	/*
4045	 * set elr to NULL here since it has been inserted to
4046	 * the request_list and the removal and free of it is
4047	 * handled by ext4_clear_request_list from now on.
4048	 */
4049	elr = NULL;
4050
4051	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
4052		ret = ext4_run_lazyinit_thread();
4053		if (ret)
4054			goto out;
4055	}
4056out:
4057	mutex_unlock(&ext4_li_mtx);
4058	if (ret)
4059		kfree(elr);
4060	return ret;
4061}
4062
4063/*
4064 * We do not need to lock anything since this is called on
4065 * module unload.
4066 */
4067static void ext4_destroy_lazyinit_thread(void)
4068{
4069	/*
4070	 * If thread exited earlier
4071	 * there's nothing to be done.
4072	 */
4073	if (!ext4_li_info || !ext4_lazyinit_task)
4074		return;
4075
4076	kthread_stop(ext4_lazyinit_task);
4077}
4078
4079static int set_journal_csum_feature_set(struct super_block *sb)
4080{
4081	int ret = 1;
4082	int compat, incompat;
4083	struct ext4_sb_info *sbi = EXT4_SB(sb);
4084
4085	if (ext4_has_metadata_csum(sb)) {
4086		/* journal checksum v3 */
4087		compat = 0;
4088		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
4089	} else {
4090		/* journal checksum v1 */
4091		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
4092		incompat = 0;
4093	}
4094
4095	jbd2_journal_clear_features(sbi->s_journal,
4096			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
4097			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
4098			JBD2_FEATURE_INCOMPAT_CSUM_V2);
4099	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4100		ret = jbd2_journal_set_features(sbi->s_journal,
4101				compat, 0,
4102				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
4103				incompat);
4104	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
4105		ret = jbd2_journal_set_features(sbi->s_journal,
4106				compat, 0,
4107				incompat);
4108		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4109				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4110	} else {
4111		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4112				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4113	}
4114
4115	return ret;
4116}
4117
4118/*
4119 * Note: calculating the overhead so we can be compatible with
4120 * historical BSD practice is quite difficult in the face of
4121 * clusters/bigalloc.  This is because multiple metadata blocks from
4122 * different block group can end up in the same allocation cluster.
4123 * Calculating the exact overhead in the face of clustered allocation
4124 * requires either O(all block bitmaps) in memory or O(number of block
4125 * groups**2) in time.  We will still calculate the superblock for
4126 * older file systems --- and if we come across with a bigalloc file
4127 * system with zero in s_overhead_clusters the estimate will be close to
4128 * correct especially for very large cluster sizes --- but for newer
4129 * file systems, it's better to calculate this figure once at mkfs
4130 * time, and store it in the superblock.  If the superblock value is
4131 * present (even for non-bigalloc file systems), we will use it.
4132 */
4133static int count_overhead(struct super_block *sb, ext4_group_t grp,
4134			  char *buf)
4135{
4136	struct ext4_sb_info	*sbi = EXT4_SB(sb);
4137	struct ext4_group_desc	*gdp;
4138	ext4_fsblk_t		first_block, last_block, b;
4139	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
4140	int			s, j, count = 0;
4141	int			has_super = ext4_bg_has_super(sb, grp);
4142
4143	if (!ext4_has_feature_bigalloc(sb))
4144		return (has_super + ext4_bg_num_gdb(sb, grp) +
4145			(has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
4146			sbi->s_itb_per_group + 2);
4147
4148	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
4149		(grp * EXT4_BLOCKS_PER_GROUP(sb));
4150	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
4151	for (i = 0; i < ngroups; i++) {
4152		gdp = ext4_get_group_desc(sb, i, NULL);
4153		b = ext4_block_bitmap(sb, gdp);
4154		if (b >= first_block && b <= last_block) {
4155			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
4156			count++;
4157		}
4158		b = ext4_inode_bitmap(sb, gdp);
4159		if (b >= first_block && b <= last_block) {
4160			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
4161			count++;
4162		}
4163		b = ext4_inode_table(sb, gdp);
4164		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
4165			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
4166				int c = EXT4_B2C(sbi, b - first_block);
4167				ext4_set_bit(c, buf);
4168				count++;
4169			}
4170		if (i != grp)
4171			continue;
4172		s = 0;
4173		if (ext4_bg_has_super(sb, grp)) {
4174			ext4_set_bit(s++, buf);
4175			count++;
4176		}
4177		j = ext4_bg_num_gdb(sb, grp);
4178		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
4179			ext4_error(sb, "Invalid number of block group "
4180				   "descriptor blocks: %d", j);
4181			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
4182		}
4183		count += j;
4184		for (; j > 0; j--)
4185			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
4186	}
4187	if (!count)
4188		return 0;
4189	return EXT4_CLUSTERS_PER_GROUP(sb) -
4190		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
4191}
4192
4193/*
4194 * Compute the overhead and stash it in sbi->s_overhead
4195 */
4196int ext4_calculate_overhead(struct super_block *sb)
4197{
4198	struct ext4_sb_info *sbi = EXT4_SB(sb);
4199	struct ext4_super_block *es = sbi->s_es;
4200	struct inode *j_inode;
4201	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
4202	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4203	ext4_fsblk_t overhead = 0;
4204	char *buf = (char *) get_zeroed_page(GFP_NOFS);
4205
4206	if (!buf)
4207		return -ENOMEM;
4208
4209	/*
4210	 * Compute the overhead (FS structures).  This is constant
4211	 * for a given filesystem unless the number of block groups
4212	 * changes so we cache the previous value until it does.
4213	 */
4214
4215	/*
4216	 * All of the blocks before first_data_block are overhead
4217	 */
4218	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4219
4220	/*
4221	 * Add the overhead found in each block group
4222	 */
4223	for (i = 0; i < ngroups; i++) {
4224		int blks;
4225
4226		blks = count_overhead(sb, i, buf);
4227		overhead += blks;
4228		if (blks)
4229			memset(buf, 0, PAGE_SIZE);
4230		cond_resched();
4231	}
4232
4233	/*
4234	 * Add the internal journal blocks whether the journal has been
4235	 * loaded or not
4236	 */
4237	if (sbi->s_journal && !sbi->s_journal_bdev)
4238		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
4239	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
4240		/* j_inum for internal journal is non-zero */
4241		j_inode = ext4_get_journal_inode(sb, j_inum);
4242		if (!IS_ERR(j_inode)) {
4243			j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
4244			overhead += EXT4_NUM_B2C(sbi, j_blocks);
4245			iput(j_inode);
4246		} else {
4247			ext4_msg(sb, KERN_ERR, "can't get journal size");
4248		}
4249	}
4250	sbi->s_overhead = overhead;
4251	smp_wmb();
4252	free_page((unsigned long) buf);
4253	return 0;
4254}
4255
4256static void ext4_set_resv_clusters(struct super_block *sb)
4257{
4258	ext4_fsblk_t resv_clusters;
4259	struct ext4_sb_info *sbi = EXT4_SB(sb);
4260
4261	/*
4262	 * There's no need to reserve anything when we aren't using extents.
4263	 * The space estimates are exact, there are no unwritten extents,
4264	 * hole punching doesn't need new metadata... This is needed especially
4265	 * to keep ext2/3 backward compatibility.
4266	 */
4267	if (!ext4_has_feature_extents(sb))
4268		return;
4269	/*
4270	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
4271	 * This should cover the situations where we can not afford to run
4272	 * out of space like for example punch hole, or converting
4273	 * unwritten extents in delalloc path. In most cases such
4274	 * allocation would require 1, or 2 blocks, higher numbers are
4275	 * very rare.
4276	 */
4277	resv_clusters = (ext4_blocks_count(sbi->s_es) >>
4278			 sbi->s_cluster_bits);
4279
4280	do_div(resv_clusters, 50);
4281	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
4282
4283	atomic64_set(&sbi->s_resv_clusters, resv_clusters);
4284}
4285
/*
 * Return a short constant string naming the quota mode of @sb:
 * "disabled" when the kernel is built without CONFIG_QUOTA, otherwise
 * "none", "journalled" or "writeback".
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifndef CONFIG_QUOTA
	return "disabled";
#else
	if (!ext4_quota_capable(sb))
		return "none";

	return (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) ?
		"journalled" : "writeback";
#endif
}
4300
4301static void ext4_setup_csum_trigger(struct super_block *sb,
4302				    enum ext4_journal_trigger_type type,
4303				    void (*trigger)(
4304					struct jbd2_buffer_trigger_type *type,
4305					struct buffer_head *bh,
4306					void *mapped_data,
4307					size_t size))
4308{
4309	struct ext4_sb_info *sbi = EXT4_SB(sb);
4310
4311	sbi->s_journal_triggers[type].sb = sb;
4312	sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
4313}
4314
4315static void ext4_free_sbi(struct ext4_sb_info *sbi)
4316{
4317	if (!sbi)
4318		return;
4319
4320	kfree(sbi->s_blockgroup_lock);
4321	fs_put_dax(sbi->s_daxdev, NULL);
4322	kfree(sbi);
4323}
4324
4325static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
4326{
4327	struct ext4_sb_info *sbi;
4328
4329	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
4330	if (!sbi)
4331		return NULL;
4332
4333	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
4334					   NULL, NULL);
4335
4336	sbi->s_blockgroup_lock =
4337		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
4338
4339	if (!sbi->s_blockgroup_lock)
4340		goto err_out;
4341
4342	sb->s_fs_info = sbi;
4343	sbi->s_sb = sb;
4344	return sbi;
4345err_out:
4346	fs_put_dax(sbi->s_daxdev, NULL);
4347	kfree(sbi);
4348	return NULL;
4349}
4350
4351static void ext4_set_def_opts(struct super_block *sb,
4352			      struct ext4_super_block *es)
4353{
4354	unsigned long def_mount_opts;
4355
4356	/* Set defaults before we parse the mount options */
4357	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
4358	set_opt(sb, INIT_INODE_TABLE);
4359	if (def_mount_opts & EXT4_DEFM_DEBUG)
4360		set_opt(sb, DEBUG);
4361	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
4362		set_opt(sb, GRPID);
4363	if (def_mount_opts & EXT4_DEFM_UID16)
4364		set_opt(sb, NO_UID32);
4365	/* xattr user namespace & acls are now defaulted on */
4366	set_opt(sb, XATTR_USER);
4367#ifdef CONFIG_EXT4_FS_POSIX_ACL
4368	set_opt(sb, POSIX_ACL);
4369#endif
4370	if (ext4_has_feature_fast_commit(sb))
4371		set_opt2(sb, JOURNAL_FAST_COMMIT);
4372	/* don't forget to enable journal_csum when metadata_csum is enabled. */
4373	if (ext4_has_metadata_csum(sb))
4374		set_opt(sb, JOURNAL_CHECKSUM);
4375
4376	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
4377		set_opt(sb, JOURNAL_DATA);
4378	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
4379		set_opt(sb, ORDERED_DATA);
4380	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
4381		set_opt(sb, WRITEBACK_DATA);
4382
4383	if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
4384		set_opt(sb, ERRORS_PANIC);
4385	else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
4386		set_opt(sb, ERRORS_CONT);
4387	else
4388		set_opt(sb, ERRORS_RO);
4389	/* block_validity enabled by default; disable with noblock_validity */
4390	set_opt(sb, BLOCK_VALIDITY);
4391	if (def_mount_opts & EXT4_DEFM_DISCARD)
4392		set_opt(sb, DISCARD);
4393
4394	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
4395		set_opt(sb, BARRIER);
4396
4397	/*
4398	 * enable delayed allocation by default
4399	 * Use -o nodelalloc to turn it off
4400	 */
4401	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
4402	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
4403		set_opt(sb, DELALLOC);
4404
4405	if (sb->s_blocksize == PAGE_SIZE)
4406		set_opt(sb, DIOREAD_NOLOCK);
4407}
4408
4409static int ext4_handle_clustersize(struct super_block *sb)
4410{
4411	struct ext4_sb_info *sbi = EXT4_SB(sb);
4412	struct ext4_super_block *es = sbi->s_es;
4413	int clustersize;
4414
4415	/* Handle clustersize */
4416	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4417	if (ext4_has_feature_bigalloc(sb)) {
4418		if (clustersize < sb->s_blocksize) {
4419			ext4_msg(sb, KERN_ERR,
4420				 "cluster size (%d) smaller than "
4421				 "block size (%lu)", clustersize, sb->s_blocksize);
4422			return -EINVAL;
4423		}
4424		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4425			le32_to_cpu(es->s_log_block_size);
4426		sbi->s_clusters_per_group =
4427			le32_to_cpu(es->s_clusters_per_group);
4428		if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
4429			ext4_msg(sb, KERN_ERR,
4430				 "#clusters per group too big: %lu",
4431				 sbi->s_clusters_per_group);
4432			return -EINVAL;
4433		}
4434		if (sbi->s_blocks_per_group !=
4435		    (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
4436			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4437				 "clusters per group (%lu) inconsistent",
4438				 sbi->s_blocks_per_group,
4439				 sbi->s_clusters_per_group);
4440			return -EINVAL;
4441		}
4442	} else {
4443		if (clustersize != sb->s_blocksize) {
4444			ext4_msg(sb, KERN_ERR,
4445				 "fragment/cluster size (%d) != "
4446				 "block size (%lu)", clustersize, sb->s_blocksize);
4447			return -EINVAL;
4448		}
4449		if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
4450			ext4_msg(sb, KERN_ERR,
4451				 "#blocks per group too big: %lu",
4452				 sbi->s_blocks_per_group);
4453			return -EINVAL;
4454		}
4455		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4456		sbi->s_cluster_bits = 0;
4457	}
4458	sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
4459
4460	/* Do we have standard group size of clustersize * 8 blocks ? */
4461	if (sbi->s_blocks_per_group == clustersize << 3)
4462		set_opt2(sb, STD_GROUP_SIZE);
4463
4464	return 0;
4465}
4466
4467static void ext4_fast_commit_init(struct super_block *sb)
4468{
4469	struct ext4_sb_info *sbi = EXT4_SB(sb);
4470
4471	/* Initialize fast commit stuff */
4472	atomic_set(&sbi->s_fc_subtid, 0);
4473	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
4474	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
4475	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
4476	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
4477	sbi->s_fc_bytes = 0;
4478	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
4479	sbi->s_fc_ineligible_tid = 0;
4480	spin_lock_init(&sbi->s_fc_lock);
4481	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
4482	sbi->s_fc_replay_state.fc_regions = NULL;
4483	sbi->s_fc_replay_state.fc_regions_size = 0;
4484	sbi->s_fc_replay_state.fc_regions_used = 0;
4485	sbi->s_fc_replay_state.fc_regions_valid = 0;
4486	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
4487	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
4488	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
4489}
4490
4491static int ext4_inode_info_init(struct super_block *sb,
4492				struct ext4_super_block *es)
4493{
4494	struct ext4_sb_info *sbi = EXT4_SB(sb);
4495
4496	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
4497		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
4498		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
4499	} else {
4500		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
4501		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
4502		if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
4503			ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
4504				 sbi->s_first_ino);
4505			return -EINVAL;
4506		}
4507		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
4508		    (!is_power_of_2(sbi->s_inode_size)) ||
4509		    (sbi->s_inode_size > sb->s_blocksize)) {
4510			ext4_msg(sb, KERN_ERR,
4511			       "unsupported inode size: %d",
4512			       sbi->s_inode_size);
4513			ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
4514			return -EINVAL;
4515		}
4516		/*
4517		 * i_atime_extra is the last extra field available for
4518		 * [acm]times in struct ext4_inode. Checking for that
4519		 * field should suffice to ensure we have extra space
4520		 * for all three.
4521		 */
4522		if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
4523			sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
4524			sb->s_time_gran = 1;
4525			sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
4526		} else {
4527			sb->s_time_gran = NSEC_PER_SEC;
4528			sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
4529		}
4530		sb->s_time_min = EXT4_TIMESTAMP_MIN;
4531	}
4532
4533	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4534		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4535			EXT4_GOOD_OLD_INODE_SIZE;
4536		if (ext4_has_feature_extra_isize(sb)) {
4537			unsigned v, max = (sbi->s_inode_size -
4538					   EXT4_GOOD_OLD_INODE_SIZE);
4539
4540			v = le16_to_cpu(es->s_want_extra_isize);
4541			if (v > max) {
4542				ext4_msg(sb, KERN_ERR,
4543					 "bad s_want_extra_isize: %d", v);
4544				return -EINVAL;
4545			}
4546			if (sbi->s_want_extra_isize < v)
4547				sbi->s_want_extra_isize = v;
4548
4549			v = le16_to_cpu(es->s_min_extra_isize);
4550			if (v > max) {
4551				ext4_msg(sb, KERN_ERR,
4552					 "bad s_min_extra_isize: %d", v);
4553				return -EINVAL;
4554			}
4555			if (sbi->s_want_extra_isize < v)
4556				sbi->s_want_extra_isize = v;
4557		}
4558	}
4559
4560	return 0;
4561}
4562
4563#if IS_ENABLED(CONFIG_UNICODE)
4564static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4565{
4566	const struct ext4_sb_encodings *encoding_info;
4567	struct unicode_map *encoding;
4568	__u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
4569
4570	if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
4571		return 0;
4572
4573	encoding_info = ext4_sb_read_encoding(es);
4574	if (!encoding_info) {
4575		ext4_msg(sb, KERN_ERR,
4576			"Encoding requested by superblock is unknown");
4577		return -EINVAL;
4578	}
4579
4580	encoding = utf8_load(encoding_info->version);
4581	if (IS_ERR(encoding)) {
4582		ext4_msg(sb, KERN_ERR,
4583			"can't mount with superblock charset: %s-%u.%u.%u "
4584			"not supported by the kernel. flags: 0x%x.",
4585			encoding_info->name,
4586			unicode_major(encoding_info->version),
4587			unicode_minor(encoding_info->version),
4588			unicode_rev(encoding_info->version),
4589			encoding_flags);
4590		return -EINVAL;
4591	}
4592	ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4593		"%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
4594		unicode_major(encoding_info->version),
4595		unicode_minor(encoding_info->version),
4596		unicode_rev(encoding_info->version),
4597		encoding_flags);
4598
4599	sb->s_encoding = encoding;
4600	sb->s_encoding_flags = encoding_flags;
4601
4602	return 0;
4603}
4604#else
4605static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4606{
4607	return 0;
4608}
4609#endif
4610
4611static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
4612{
4613	struct ext4_sb_info *sbi = EXT4_SB(sb);
4614
4615	/* Warn if metadata_csum and gdt_csum are both set. */
4616	if (ext4_has_feature_metadata_csum(sb) &&
4617	    ext4_has_feature_gdt_csum(sb))
4618		ext4_warning(sb, "metadata_csum and uninit_bg are "
4619			     "redundant flags; please run fsck.");
4620
4621	/* Check for a known checksum algorithm */
4622	if (!ext4_verify_csum_type(sb, es)) {
4623		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4624			 "unknown checksum algorithm.");
4625		return -EINVAL;
4626	}
4627	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
4628				ext4_orphan_file_block_trigger);
4629
4630	/* Load the checksum driver */
4631	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
4632	if (IS_ERR(sbi->s_chksum_driver)) {
4633		int ret = PTR_ERR(sbi->s_chksum_driver);
4634		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
4635		sbi->s_chksum_driver = NULL;
4636		return ret;
4637	}
4638
4639	/* Check superblock checksum */
4640	if (!ext4_superblock_csum_verify(sb, es)) {
4641		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4642			 "invalid superblock checksum.  Run e2fsck?");
4643		return -EFSBADCRC;
4644	}
4645
4646	/* Precompute checksum seed for all metadata */
4647	if (ext4_has_feature_csum_seed(sb))
4648		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
4649	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
4650		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
4651					       sizeof(es->s_uuid));
4652	return 0;
4653}
4654
4655static int ext4_check_feature_compatibility(struct super_block *sb,
4656					    struct ext4_super_block *es,
4657					    int silent)
4658{
4659	struct ext4_sb_info *sbi = EXT4_SB(sb);
4660
4661	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4662	    (ext4_has_compat_features(sb) ||
4663	     ext4_has_ro_compat_features(sb) ||
4664	     ext4_has_incompat_features(sb)))
4665		ext4_msg(sb, KERN_WARNING,
4666		       "feature flags set on rev 0 fs, "
4667		       "running e2fsck is recommended");
4668
4669	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4670		set_opt2(sb, HURD_COMPAT);
4671		if (ext4_has_feature_64bit(sb)) {
4672			ext4_msg(sb, KERN_ERR,
4673				 "The Hurd can't support 64-bit file systems");
4674			return -EINVAL;
4675		}
4676
4677		/*
4678		 * ea_inode feature uses l_i_version field which is not
4679		 * available in HURD_COMPAT mode.
4680		 */
4681		if (ext4_has_feature_ea_inode(sb)) {
4682			ext4_msg(sb, KERN_ERR,
4683				 "ea_inode feature is not supported for Hurd");
4684			return -EINVAL;
4685		}
4686	}
4687
4688	if (IS_EXT2_SB(sb)) {
4689		if (ext2_feature_set_ok(sb))
4690			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4691				 "using the ext4 subsystem");
4692		else {
4693			/*
4694			 * If we're probing be silent, if this looks like
4695			 * it's actually an ext[34] filesystem.
4696			 */
4697			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4698				return -EINVAL;
4699			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4700				 "to feature incompatibilities");
4701			return -EINVAL;
4702		}
4703	}
4704
4705	if (IS_EXT3_SB(sb)) {
4706		if (ext3_feature_set_ok(sb))
4707			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4708				 "using the ext4 subsystem");
4709		else {
4710			/*
4711			 * If we're probing be silent, if this looks like
4712			 * it's actually an ext4 filesystem.
4713			 */
4714			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4715				return -EINVAL;
4716			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4717				 "to feature incompatibilities");
4718			return -EINVAL;
4719		}
4720	}
4721
4722	/*
4723	 * Check feature flags regardless of the revision level, since we
4724	 * previously didn't change the revision level when setting the flags,
4725	 * so there is a chance incompat flags are set on a rev 0 filesystem.
4726	 */
4727	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4728		return -EINVAL;
4729
4730	if (sbi->s_daxdev) {
4731		if (sb->s_blocksize == PAGE_SIZE)
4732			set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4733		else
4734			ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
4735	}
4736
4737	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4738		if (ext4_has_feature_inline_data(sb)) {
4739			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4740					" that may contain inline data");
4741			return -EINVAL;
4742		}
4743		if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4744			ext4_msg(sb, KERN_ERR,
4745				"DAX unsupported by block device.");
4746			return -EINVAL;
4747		}
4748	}
4749
4750	if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4751		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4752			 es->s_encryption_level);
4753		return -EINVAL;
4754	}
4755
4756	return 0;
4757}
4758
4759static int ext4_check_geometry(struct super_block *sb,
4760			       struct ext4_super_block *es)
4761{
4762	struct ext4_sb_info *sbi = EXT4_SB(sb);
4763	__u64 blocks_count;
4764	int err;
4765
4766	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
4767		ext4_msg(sb, KERN_ERR,
4768			 "Number of reserved GDT blocks insanely large: %d",
4769			 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4770		return -EINVAL;
4771	}
4772	/*
4773	 * Test whether we have more sectors than will fit in sector_t,
4774	 * and whether the max offset is addressable by the page cache.
4775	 */
4776	err = generic_check_addressable(sb->s_blocksize_bits,
4777					ext4_blocks_count(es));
4778	if (err) {
4779		ext4_msg(sb, KERN_ERR, "filesystem"
4780			 " too large to mount safely on this system");
4781		return err;
4782	}
4783
4784	/* check blocks count against device size */
4785	blocks_count = sb_bdev_nr_blocks(sb);
4786	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4787		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4788		       "exceeds size of device (%llu blocks)",
4789		       ext4_blocks_count(es), blocks_count);
4790		return -EINVAL;
4791	}
4792
4793	/*
4794	 * It makes no sense for the first data block to be beyond the end
4795	 * of the filesystem.
4796	 */
4797	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4798		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4799			 "block %u is beyond end of filesystem (%llu)",
4800			 le32_to_cpu(es->s_first_data_block),
4801			 ext4_blocks_count(es));
4802		return -EINVAL;
4803	}
4804	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4805	    (sbi->s_cluster_ratio == 1)) {
4806		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4807			 "block is 0 with a 1k block and cluster size");
4808		return -EINVAL;
4809	}
4810
4811	blocks_count = (ext4_blocks_count(es) -
4812			le32_to_cpu(es->s_first_data_block) +
4813			EXT4_BLOCKS_PER_GROUP(sb) - 1);
4814	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4815	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4816		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4817		       "(block count %llu, first data block %u, "
4818		       "blocks per group %lu)", blocks_count,
4819		       ext4_blocks_count(es),
4820		       le32_to_cpu(es->s_first_data_block),
4821		       EXT4_BLOCKS_PER_GROUP(sb));
4822		return -EINVAL;
4823	}
4824	sbi->s_groups_count = blocks_count;
4825	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4826			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4827	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4828	    le32_to_cpu(es->s_inodes_count)) {
4829		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4830			 le32_to_cpu(es->s_inodes_count),
4831			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4832		return -EINVAL;
4833	}
4834
4835	return 0;
4836}
4837
4838static int ext4_group_desc_init(struct super_block *sb,
4839				struct ext4_super_block *es,
4840				ext4_fsblk_t logical_sb_block,
4841				ext4_group_t *first_not_zeroed)
4842{
4843	struct ext4_sb_info *sbi = EXT4_SB(sb);
4844	unsigned int db_count;
4845	ext4_fsblk_t block;
4846	int i;
4847
4848	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4849		   EXT4_DESC_PER_BLOCK(sb);
4850	if (ext4_has_feature_meta_bg(sb)) {
4851		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4852			ext4_msg(sb, KERN_WARNING,
4853				 "first meta block group too large: %u "
4854				 "(group descriptor block count %u)",
4855				 le32_to_cpu(es->s_first_meta_bg), db_count);
4856			return -EINVAL;
4857		}
4858	}
4859	rcu_assign_pointer(sbi->s_group_desc,
4860			   kvmalloc_array(db_count,
4861					  sizeof(struct buffer_head *),
4862					  GFP_KERNEL));
4863	if (sbi->s_group_desc == NULL) {
4864		ext4_msg(sb, KERN_ERR, "not enough memory");
4865		return -ENOMEM;
4866	}
4867
4868	bgl_lock_init(sbi->s_blockgroup_lock);
4869
4870	/* Pre-read the descriptors into the buffer cache */
4871	for (i = 0; i < db_count; i++) {
4872		block = descriptor_loc(sb, logical_sb_block, i);
4873		ext4_sb_breadahead_unmovable(sb, block);
4874	}
4875
4876	for (i = 0; i < db_count; i++) {
4877		struct buffer_head *bh;
4878
4879		block = descriptor_loc(sb, logical_sb_block, i);
4880		bh = ext4_sb_bread_unmovable(sb, block);
4881		if (IS_ERR(bh)) {
4882			ext4_msg(sb, KERN_ERR,
4883			       "can't read group descriptor %d", i);
4884			sbi->s_gdb_count = i;
4885			return PTR_ERR(bh);
4886		}
4887		rcu_read_lock();
4888		rcu_dereference(sbi->s_group_desc)[i] = bh;
4889		rcu_read_unlock();
4890	}
4891	sbi->s_gdb_count = db_count;
4892	if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
4893		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4894		return -EFSCORRUPTED;
4895	}
4896
4897	return 0;
4898}
4899
4900static int ext4_load_and_init_journal(struct super_block *sb,
4901				      struct ext4_super_block *es,
4902				      struct ext4_fs_context *ctx)
4903{
4904	struct ext4_sb_info *sbi = EXT4_SB(sb);
4905	int err;
4906
4907	err = ext4_load_journal(sb, es, ctx->journal_devnum);
4908	if (err)
4909		return err;
4910
4911	if (ext4_has_feature_64bit(sb) &&
4912	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4913				       JBD2_FEATURE_INCOMPAT_64BIT)) {
4914		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4915		goto out;
4916	}
4917
4918	if (!set_journal_csum_feature_set(sb)) {
4919		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4920			 "feature set");
4921		goto out;
4922	}
4923
4924	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
4925		!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4926					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
4927		ext4_msg(sb, KERN_ERR,
4928			"Failed to set fast commit journal feature");
4929		goto out;
4930	}
4931
4932	/* We have now updated the journal if required, so we can
4933	 * validate the data journaling mode. */
4934	switch (test_opt(sb, DATA_FLAGS)) {
4935	case 0:
4936		/* No mode set, assume a default based on the journal
4937		 * capabilities: ORDERED_DATA if the journal can
4938		 * cope, else JOURNAL_DATA
4939		 */
4940		if (jbd2_journal_check_available_features
4941		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4942			set_opt(sb, ORDERED_DATA);
4943			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4944		} else {
4945			set_opt(sb, JOURNAL_DATA);
4946			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4947		}
4948		break;
4949
4950	case EXT4_MOUNT_ORDERED_DATA:
4951	case EXT4_MOUNT_WRITEBACK_DATA:
4952		if (!jbd2_journal_check_available_features
4953		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4954			ext4_msg(sb, KERN_ERR, "Journal does not support "
4955			       "requested data journaling mode");
4956			goto out;
4957		}
4958		break;
4959	default:
4960		break;
4961	}
4962
4963	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4964	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4965		ext4_msg(sb, KERN_ERR, "can't mount with "
4966			"journal_async_commit in data=ordered mode");
4967		goto out;
4968	}
4969
4970	set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
4971
4972	sbi->s_journal->j_submit_inode_data_buffers =
4973		ext4_journal_submit_inode_data_buffers;
4974	sbi->s_journal->j_finish_inode_data_buffers =
4975		ext4_journal_finish_inode_data_buffers;
4976
4977	return 0;
4978
4979out:
4980	/* flush s_sb_upd_work before destroying the journal. */
4981	flush_work(&sbi->s_sb_upd_work);
4982	jbd2_journal_destroy(sbi->s_journal);
4983	sbi->s_journal = NULL;
4984	return -EINVAL;
4985}
4986
4987static int ext4_check_journal_data_mode(struct super_block *sb)
4988{
4989	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4990		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
4991			    "data=journal disables delayed allocation, "
4992			    "dioread_nolock, O_DIRECT and fast_commit support!\n");
4993		/* can't mount with both data=journal and dioread_nolock. */
4994		clear_opt(sb, DIOREAD_NOLOCK);
4995		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4996		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4997			ext4_msg(sb, KERN_ERR, "can't mount with "
4998				 "both data=journal and delalloc");
4999			return -EINVAL;
5000		}
5001		if (test_opt(sb, DAX_ALWAYS)) {
5002			ext4_msg(sb, KERN_ERR, "can't mount with "
5003				 "both data=journal and dax");
5004			return -EINVAL;
5005		}
5006		if (ext4_has_feature_encrypt(sb)) {
5007			ext4_msg(sb, KERN_WARNING,
5008				 "encrypted files will use data=ordered "
5009				 "instead of data journaling mode");
5010		}
5011		if (test_opt(sb, DELALLOC))
5012			clear_opt(sb, DELALLOC);
5013	} else {
5014		sb->s_iflags |= SB_I_CGROUPWB;
5015	}
5016
5017	return 0;
5018}
5019
5020static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
5021			   int silent)
5022{
5023	struct ext4_sb_info *sbi = EXT4_SB(sb);
5024	struct ext4_super_block *es;
5025	ext4_fsblk_t logical_sb_block;
5026	unsigned long offset = 0;
5027	struct buffer_head *bh;
5028	int ret = -EINVAL;
5029	int blocksize;
5030
5031	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
5032	if (!blocksize) {
5033		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
5034		return -EINVAL;
5035	}
5036
5037	/*
5038	 * The ext4 superblock will not be buffer aligned for other than 1kB
5039	 * block sizes.  We need to calculate the offset from buffer start.
5040	 */
5041	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
5042		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
5043		offset = do_div(logical_sb_block, blocksize);
5044	} else {
5045		logical_sb_block = sbi->s_sb_block;
5046	}
5047
5048	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
5049	if (IS_ERR(bh)) {
5050		ext4_msg(sb, KERN_ERR, "unable to read superblock");
5051		return PTR_ERR(bh);
5052	}
5053	/*
5054	 * Note: s_es must be initialized as soon as possible because
5055	 *       some ext4 macro-instructions depend on its value
5056	 */
5057	es = (struct ext4_super_block *) (bh->b_data + offset);
5058	sbi->s_es = es;
5059	sb->s_magic = le16_to_cpu(es->s_magic);
5060	if (sb->s_magic != EXT4_SUPER_MAGIC) {
5061		if (!silent)
5062			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5063		goto out;
5064	}
5065
5066	if (le32_to_cpu(es->s_log_block_size) >
5067	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
5068		ext4_msg(sb, KERN_ERR,
5069			 "Invalid log block size: %u",
5070			 le32_to_cpu(es->s_log_block_size));
5071		goto out;
5072	}
5073	if (le32_to_cpu(es->s_log_cluster_size) >
5074	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
5075		ext4_msg(sb, KERN_ERR,
5076			 "Invalid log cluster size: %u",
5077			 le32_to_cpu(es->s_log_cluster_size));
5078		goto out;
5079	}
5080
5081	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
5082
5083	/*
5084	 * If the default block size is not the same as the real block size,
5085	 * we need to reload it.
5086	 */
5087	if (sb->s_blocksize == blocksize) {
5088		*lsb = logical_sb_block;
5089		sbi->s_sbh = bh;
5090		return 0;
5091	}
5092
5093	/*
5094	 * bh must be released before kill_bdev(), otherwise
5095	 * it won't be freed and its page also. kill_bdev()
5096	 * is called by sb_set_blocksize().
5097	 */
5098	brelse(bh);
5099	/* Validate the filesystem blocksize */
5100	if (!sb_set_blocksize(sb, blocksize)) {
5101		ext4_msg(sb, KERN_ERR, "bad block size %d",
5102				blocksize);
5103		bh = NULL;
5104		goto out;
5105	}
5106
5107	logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
5108	offset = do_div(logical_sb_block, blocksize);
5109	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
5110	if (IS_ERR(bh)) {
5111		ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
5112		ret = PTR_ERR(bh);
5113		bh = NULL;
5114		goto out;
5115	}
5116	es = (struct ext4_super_block *)(bh->b_data + offset);
5117	sbi->s_es = es;
5118	if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
5119		ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
5120		goto out;
5121	}
5122	*lsb = logical_sb_block;
5123	sbi->s_sbh = bh;
5124	return 0;
5125out:
5126	brelse(bh);
5127	return ret;
5128}
5129
5130static void ext4_hash_info_init(struct super_block *sb)
5131{
5132	struct ext4_sb_info *sbi = EXT4_SB(sb);
5133	struct ext4_super_block *es = sbi->s_es;
5134	unsigned int i;
5135
5136	for (i = 0; i < 4; i++)
5137		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
5138
5139	sbi->s_def_hash_version = es->s_def_hash_version;
5140	if (ext4_has_feature_dir_index(sb)) {
5141		i = le32_to_cpu(es->s_flags);
5142		if (i & EXT2_FLAGS_UNSIGNED_HASH)
5143			sbi->s_hash_unsigned = 3;
5144		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
5145#ifdef __CHAR_UNSIGNED__
5146			if (!sb_rdonly(sb))
5147				es->s_flags |=
5148					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
5149			sbi->s_hash_unsigned = 3;
5150#else
5151			if (!sb_rdonly(sb))
5152				es->s_flags |=
5153					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
5154#endif
5155		}
5156	}
5157}
5158
5159static int ext4_block_group_meta_init(struct super_block *sb, int silent)
5160{
5161	struct ext4_sb_info *sbi = EXT4_SB(sb);
5162	struct ext4_super_block *es = sbi->s_es;
5163	int has_huge_files;
5164
5165	has_huge_files = ext4_has_feature_huge_file(sb);
5166	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
5167						      has_huge_files);
5168	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
5169
5170	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
5171	if (ext4_has_feature_64bit(sb)) {
5172		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
5173		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
5174		    !is_power_of_2(sbi->s_desc_size)) {
5175			ext4_msg(sb, KERN_ERR,
5176			       "unsupported descriptor size %lu",
5177			       sbi->s_desc_size);
5178			return -EINVAL;
5179		}
5180	} else
5181		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
5182
5183	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
5184	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
5185
5186	sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
5187	if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
5188		if (!silent)
5189			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5190		return -EINVAL;
5191	}
5192	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
5193	    sbi->s_inodes_per_group > sb->s_blocksize * 8) {
5194		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
5195			 sbi->s_inodes_per_group);
5196		return -EINVAL;
5197	}
5198	sbi->s_itb_per_group = sbi->s_inodes_per_group /
5199					sbi->s_inodes_per_block;
5200	sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
5201	sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
5202	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
5203	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
5204
5205	return 0;
5206}
5207
5208static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
5209{
5210	struct ext4_super_block *es = NULL;
5211	struct ext4_sb_info *sbi = EXT4_SB(sb);
5212	ext4_fsblk_t logical_sb_block;
5213	struct inode *root;
5214	int needs_recovery;
5215	int err;
5216	ext4_group_t first_not_zeroed;
5217	struct ext4_fs_context *ctx = fc->fs_private;
5218	int silent = fc->sb_flags & SB_SILENT;
5219
5220	/* Set defaults for the variables that will be set during parsing */
5221	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
5222		ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
5223
5224	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
5225	sbi->s_sectors_written_start =
5226		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
5227
5228	err = ext4_load_super(sb, &logical_sb_block, silent);
5229	if (err)
5230		goto out_fail;
5231
5232	es = sbi->s_es;
5233	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
5234
5235	err = ext4_init_metadata_csum(sb, es);
5236	if (err)
5237		goto failed_mount;
5238
5239	ext4_set_def_opts(sb, es);
5240
5241	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
5242	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
5243	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
5244	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
5245	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
5246
5247	/*
5248	 * set default s_li_wait_mult for lazyinit, for the case there is
5249	 * no mount option specified.
5250	 */
5251	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
5252
5253	err = ext4_inode_info_init(sb, es);
5254	if (err)
5255		goto failed_mount;
5256
5257	err = parse_apply_sb_mount_options(sb, ctx);
5258	if (err < 0)
5259		goto failed_mount;
5260
5261	sbi->s_def_mount_opt = sbi->s_mount_opt;
5262	sbi->s_def_mount_opt2 = sbi->s_mount_opt2;
5263
5264	err = ext4_check_opt_consistency(fc, sb);
5265	if (err < 0)
5266		goto failed_mount;
5267
5268	ext4_apply_options(fc, sb);
5269
5270	err = ext4_encoding_init(sb, es);
5271	if (err)
5272		goto failed_mount;
5273
5274	err = ext4_check_journal_data_mode(sb);
5275	if (err)
5276		goto failed_mount;
5277
5278	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5279		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5280
5281	/* i_version is always enabled now */
5282	sb->s_flags |= SB_I_VERSION;
5283
5284	err = ext4_check_feature_compatibility(sb, es, silent);
5285	if (err)
5286		goto failed_mount;
5287
5288	err = ext4_block_group_meta_init(sb, silent);
5289	if (err)
5290		goto failed_mount;
5291
5292	ext4_hash_info_init(sb);
5293
5294	err = ext4_handle_clustersize(sb);
5295	if (err)
5296		goto failed_mount;
5297
5298	err = ext4_check_geometry(sb, es);
5299	if (err)
5300		goto failed_mount;
5301
5302	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
5303	spin_lock_init(&sbi->s_error_lock);
5304	INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
5305
5306	err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
5307	if (err)
5308		goto failed_mount3;
5309
5310	err = ext4_es_register_shrinker(sbi);
5311	if (err)
5312		goto failed_mount3;
5313
5314	sbi->s_stripe = ext4_get_stripe_size(sbi);
5315	/*
5316	 * It's hard to get stripe aligned blocks if stripe is not aligned with
5317	 * cluster, just disable stripe and alert user to simpfy code and avoid
5318	 * stripe aligned allocation which will rarely successes.
5319	 */
5320	if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
5321	    sbi->s_stripe % sbi->s_cluster_ratio != 0) {
5322		ext4_msg(sb, KERN_WARNING,
5323			 "stripe (%lu) is not aligned with cluster size (%u), "
5324			 "stripe is disabled",
5325			 sbi->s_stripe, sbi->s_cluster_ratio);
5326		sbi->s_stripe = 0;
5327	}
5328	sbi->s_extent_max_zeroout_kb = 32;
5329
5330	/*
5331	 * set up enough so that it can read an inode
5332	 */
5333	sb->s_op = &ext4_sops;
5334	sb->s_export_op = &ext4_export_ops;
5335	sb->s_xattr = ext4_xattr_handlers;
5336#ifdef CONFIG_FS_ENCRYPTION
5337	sb->s_cop = &ext4_cryptops;
5338#endif
5339#ifdef CONFIG_FS_VERITY
5340	sb->s_vop = &ext4_verityops;
5341#endif
5342#ifdef CONFIG_QUOTA
5343	sb->dq_op = &ext4_quota_operations;
5344	if (ext4_has_feature_quota(sb))
5345		sb->s_qcop = &dquot_quotactl_sysfile_ops;
5346	else
5347		sb->s_qcop = &ext4_qctl_operations;
5348	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
5349#endif
5350	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
5351
5352	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
5353	mutex_init(&sbi->s_orphan_lock);
5354
5355	ext4_fast_commit_init(sb);
5356
5357	sb->s_root = NULL;
5358
5359	needs_recovery = (es->s_last_orphan != 0 ||
5360			  ext4_has_feature_orphan_present(sb) ||
5361			  ext4_has_feature_journal_needs_recovery(sb));
5362
5363	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
5364		err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
5365		if (err)
5366			goto failed_mount3a;
5367	}
5368
5369	err = -EINVAL;
5370	/*
5371	 * The first inode we look at is the journal inode.  Don't try
5372	 * root first: it may be modified in the journal!
5373	 */
5374	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
5375		err = ext4_load_and_init_journal(sb, es, ctx);
5376		if (err)
5377			goto failed_mount3a;
5378	} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
5379		   ext4_has_feature_journal_needs_recovery(sb)) {
5380		ext4_msg(sb, KERN_ERR, "required journal recovery "
5381		       "suppressed and not mounted read-only");
5382		goto failed_mount3a;
5383	} else {
5384		/* Nojournal mode, all journal mount options are illegal */
5385		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5386			ext4_msg(sb, KERN_ERR, "can't mount with "
5387				 "journal_async_commit, fs mounted w/o journal");
5388			goto failed_mount3a;
5389		}
5390
5391		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
5392			ext4_msg(sb, KERN_ERR, "can't mount with "
5393				 "journal_checksum, fs mounted w/o journal");
5394			goto failed_mount3a;
5395		}
5396		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
5397			ext4_msg(sb, KERN_ERR, "can't mount with "
5398				 "commit=%lu, fs mounted w/o journal",
5399				 sbi->s_commit_interval / HZ);
5400			goto failed_mount3a;
5401		}
5402		if (EXT4_MOUNT_DATA_FLAGS &
5403		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
5404			ext4_msg(sb, KERN_ERR, "can't mount with "
5405				 "data=, fs mounted w/o journal");
5406			goto failed_mount3a;
5407		}
5408		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
5409		clear_opt(sb, JOURNAL_CHECKSUM);
5410		clear_opt(sb, DATA_FLAGS);
5411		clear_opt2(sb, JOURNAL_FAST_COMMIT);
5412		sbi->s_journal = NULL;
5413		needs_recovery = 0;
5414	}
5415
5416	if (!test_opt(sb, NO_MBCACHE)) {
5417		sbi->s_ea_block_cache = ext4_xattr_create_cache();
5418		if (!sbi->s_ea_block_cache) {
5419			ext4_msg(sb, KERN_ERR,
5420				 "Failed to create ea_block_cache");
5421			err = -EINVAL;
5422			goto failed_mount_wq;
5423		}
5424
5425		if (ext4_has_feature_ea_inode(sb)) {
5426			sbi->s_ea_inode_cache = ext4_xattr_create_cache();
5427			if (!sbi->s_ea_inode_cache) {
5428				ext4_msg(sb, KERN_ERR,
5429					 "Failed to create ea_inode_cache");
5430				err = -EINVAL;
5431				goto failed_mount_wq;
5432			}
5433		}
5434	}
5435
5436	/*
5437	 * Get the # of file system overhead blocks from the
5438	 * superblock if present.
5439	 */
5440	sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
5441	/* ignore the precalculated value if it is ridiculous */
5442	if (sbi->s_overhead > ext4_blocks_count(es))
5443		sbi->s_overhead = 0;
5444	/*
5445	 * If the bigalloc feature is not enabled recalculating the
5446	 * overhead doesn't take long, so we might as well just redo
5447	 * it to make sure we are using the correct value.
5448	 */
5449	if (!ext4_has_feature_bigalloc(sb))
5450		sbi->s_overhead = 0;
5451	if (sbi->s_overhead == 0) {
5452		err = ext4_calculate_overhead(sb);
5453		if (err)
5454			goto failed_mount_wq;
5455	}
5456
5457	/*
5458	 * The maximum number of concurrent works can be high and
5459	 * concurrency isn't really necessary.  Limit it to 1.
5460	 */
5461	EXT4_SB(sb)->rsv_conversion_wq =
5462		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
5463	if (!EXT4_SB(sb)->rsv_conversion_wq) {
5464		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
5465		err = -ENOMEM;
5466		goto failed_mount4;
5467	}
5468
5469	/*
5470	 * The jbd2_journal_load will have done any necessary log recovery,
5471	 * so we can safely mount the rest of the filesystem now.
5472	 */
5473
5474	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
5475	if (IS_ERR(root)) {
5476		ext4_msg(sb, KERN_ERR, "get root inode failed");
5477		err = PTR_ERR(root);
5478		root = NULL;
5479		goto failed_mount4;
5480	}
5481	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
5482		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
5483		iput(root);
5484		err = -EFSCORRUPTED;
5485		goto failed_mount4;
5486	}
5487
5488	sb->s_root = d_make_root(root);
5489	if (!sb->s_root) {
5490		ext4_msg(sb, KERN_ERR, "get root dentry failed");
5491		err = -ENOMEM;
5492		goto failed_mount4;
5493	}
5494
5495	err = ext4_setup_super(sb, es, sb_rdonly(sb));
5496	if (err == -EROFS) {
5497		sb->s_flags |= SB_RDONLY;
5498	} else if (err)
5499		goto failed_mount4a;
5500
5501	ext4_set_resv_clusters(sb);
5502
5503	if (test_opt(sb, BLOCK_VALIDITY)) {
5504		err = ext4_setup_system_zone(sb);
5505		if (err) {
5506			ext4_msg(sb, KERN_ERR, "failed to initialize system "
5507				 "zone (%d)", err);
5508			goto failed_mount4a;
5509		}
5510	}
5511	ext4_fc_replay_cleanup(sb);
5512
5513	ext4_ext_init(sb);
5514
5515	/*
5516	 * Enable optimize_scan if number of groups is > threshold. This can be
5517	 * turned off by passing "mb_optimize_scan=0". This can also be
5518	 * turned on forcefully by passing "mb_optimize_scan=1".
5519	 */
5520	if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
5521		if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
5522			set_opt2(sb, MB_OPTIMIZE_SCAN);
5523		else
5524			clear_opt2(sb, MB_OPTIMIZE_SCAN);
5525	}
5526
5527	err = ext4_mb_init(sb);
5528	if (err) {
5529		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
5530			 err);
5531		goto failed_mount5;
5532	}
5533
5534	/*
5535	 * We can only set up the journal commit callback once
5536	 * mballoc is initialized
5537	 */
5538	if (sbi->s_journal)
5539		sbi->s_journal->j_commit_callback =
5540			ext4_journal_commit_callback;
5541
5542	err = ext4_percpu_param_init(sbi);
5543	if (err)
5544		goto failed_mount6;
5545
5546	if (ext4_has_feature_flex_bg(sb))
5547		if (!ext4_fill_flex_info(sb)) {
5548			ext4_msg(sb, KERN_ERR,
5549			       "unable to initialize "
5550			       "flex_bg meta info!");
5551			err = -ENOMEM;
5552			goto failed_mount6;
5553		}
5554
5555	err = ext4_register_li_request(sb, first_not_zeroed);
5556	if (err)
5557		goto failed_mount6;
5558
5559	err = ext4_register_sysfs(sb);
5560	if (err)
5561		goto failed_mount7;
5562
5563	err = ext4_init_orphan_info(sb);
5564	if (err)
5565		goto failed_mount8;
5566#ifdef CONFIG_QUOTA
5567	/* Enable quota usage during mount. */
5568	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
5569		err = ext4_enable_quotas(sb);
5570		if (err)
5571			goto failed_mount9;
5572	}
5573#endif  /* CONFIG_QUOTA */
5574
5575	/*
5576	 * Save the original bdev mapping's wb_err value which could be
5577	 * used to detect the metadata async write error.
5578	 */
5579	spin_lock_init(&sbi->s_bdev_wb_lock);
5580	errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
5581				 &sbi->s_bdev_wb_err);
5582	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
5583	ext4_orphan_cleanup(sb, es);
5584	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
5585	/*
5586	 * Update the checksum after updating free space/inode counters and
5587	 * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
5588	 * checksum in the buffer cache until it is written out and
5589	 * e2fsprogs programs trying to open a file system immediately
5590	 * after it is mounted can fail.
5591	 */
5592	ext4_superblock_csum_set(sb);
5593	if (needs_recovery) {
5594		ext4_msg(sb, KERN_INFO, "recovery complete");
5595		err = ext4_mark_recovery_complete(sb, es);
5596		if (err)
5597			goto failed_mount10;
5598	}
5599
5600	if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
5601		ext4_msg(sb, KERN_WARNING,
5602			 "mounting with \"discard\" option, but the device does not support discard");
5603
5604	if (es->s_error_count)
5605		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
5606
5607	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
5608	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
5609	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
5610	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
5611	atomic_set(&sbi->s_warning_count, 0);
5612	atomic_set(&sbi->s_msg_count, 0);
5613
5614	return 0;
5615
5616failed_mount10:
5617	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
5618failed_mount9: __maybe_unused
5619	ext4_release_orphan_info(sb);
5620failed_mount8:
5621	ext4_unregister_sysfs(sb);
5622	kobject_put(&sbi->s_kobj);
5623failed_mount7:
5624	ext4_unregister_li_request(sb);
5625failed_mount6:
5626	ext4_mb_release(sb);
5627	ext4_flex_groups_free(sbi);
5628	ext4_percpu_param_destroy(sbi);
5629failed_mount5:
5630	ext4_ext_release(sb);
5631	ext4_release_system_zone(sb);
5632failed_mount4a:
5633	dput(sb->s_root);
5634	sb->s_root = NULL;
5635failed_mount4:
5636	ext4_msg(sb, KERN_ERR, "mount failed");
5637	if (EXT4_SB(sb)->rsv_conversion_wq)
5638		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
5639failed_mount_wq:
5640	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
5641	sbi->s_ea_inode_cache = NULL;
5642
5643	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
5644	sbi->s_ea_block_cache = NULL;
5645
5646	if (sbi->s_journal) {
5647		/* flush s_sb_upd_work before journal destroy. */
5648		flush_work(&sbi->s_sb_upd_work);
5649		jbd2_journal_destroy(sbi->s_journal);
5650		sbi->s_journal = NULL;
5651	}
5652failed_mount3a:
5653	ext4_es_unregister_shrinker(sbi);
5654failed_mount3:
5655	/* flush s_sb_upd_work before sbi destroy */
5656	flush_work(&sbi->s_sb_upd_work);
5657	del_timer_sync(&sbi->s_err_report);
5658	ext4_stop_mmpd(sbi);
5659	ext4_group_desc_free(sbi);
5660failed_mount:
5661	if (sbi->s_chksum_driver)
5662		crypto_free_shash(sbi->s_chksum_driver);
5663
5664#if IS_ENABLED(CONFIG_UNICODE)
5665	utf8_unload(sb->s_encoding);
5666#endif
5667
5668#ifdef CONFIG_QUOTA
5669	for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
5670		kfree(get_qf_name(sb, sbi, i));
5671#endif
5672	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
5673	brelse(sbi->s_sbh);
5674	if (sbi->s_journal_bdev) {
5675		invalidate_bdev(sbi->s_journal_bdev);
5676		blkdev_put(sbi->s_journal_bdev, sb);
5677	}
5678out_fail:
5679	invalidate_bdev(sb->s_bdev);
5680	sb->s_fs_info = NULL;
5681	return err;
5682}
5683
5684static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
5685{
5686	struct ext4_fs_context *ctx = fc->fs_private;
5687	struct ext4_sb_info *sbi;
5688	const char *descr;
5689	int ret;
5690
5691	sbi = ext4_alloc_sbi(sb);
5692	if (!sbi)
5693		return -ENOMEM;
5694
5695	fc->s_fs_info = sbi;
5696
5697	/* Cleanup superblock name */
5698	strreplace(sb->s_id, '/', '!');
5699
5700	sbi->s_sb_block = 1;	/* Default super block location */
5701	if (ctx->spec & EXT4_SPEC_s_sb_block)
5702		sbi->s_sb_block = ctx->s_sb_block;
5703
5704	ret = __ext4_fill_super(fc, sb);
5705	if (ret < 0)
5706		goto free_sbi;
5707
5708	if (sbi->s_journal) {
5709		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
5710			descr = " journalled data mode";
5711		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
5712			descr = " ordered data mode";
5713		else
5714			descr = " writeback data mode";
5715	} else
5716		descr = "out journal";
5717
5718	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
5719		ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
5720			 "Quota mode: %s.", &sb->s_uuid,
5721			 sb_rdonly(sb) ? "ro" : "r/w", descr,
5722			 ext4_quota_mode(sb));
5723
5724	/* Update the s_overhead_clusters if necessary */
5725	ext4_update_overhead(sb, false);
5726	return 0;
5727
5728free_sbi:
5729	ext4_free_sbi(sbi);
5730	fc->s_fs_info = NULL;
5731	return ret;
5732}
5733
5734static int ext4_get_tree(struct fs_context *fc)
5735{
5736	return get_tree_bdev(fc, ext4_fill_super);
5737}
5738
5739/*
5740 * Setup any per-fs journal parameters now.  We'll do this both on
5741 * initial mount, once the journal has been initialised but before we've
5742 * done any recovery; and again on any subsequent remount.
5743 */
5744static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
5745{
5746	struct ext4_sb_info *sbi = EXT4_SB(sb);
5747
5748	journal->j_commit_interval = sbi->s_commit_interval;
5749	journal->j_min_batch_time = sbi->s_min_batch_time;
5750	journal->j_max_batch_time = sbi->s_max_batch_time;
5751	ext4_fc_init(sb, journal);
5752
5753	write_lock(&journal->j_state_lock);
5754	if (test_opt(sb, BARRIER))
5755		journal->j_flags |= JBD2_BARRIER;
5756	else
5757		journal->j_flags &= ~JBD2_BARRIER;
5758	if (test_opt(sb, DATA_ERR_ABORT))
5759		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
5760	else
5761		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
5762	/*
5763	 * Always enable journal cycle record option, letting the journal
5764	 * records log transactions continuously between each mount.
5765	 */
5766	journal->j_flags |= JBD2_CYCLE_RECORD;
5767	write_unlock(&journal->j_state_lock);
5768}
5769
5770static struct inode *ext4_get_journal_inode(struct super_block *sb,
5771					     unsigned int journal_inum)
5772{
5773	struct inode *journal_inode;
5774
5775	/*
5776	 * Test for the existence of a valid inode on disk.  Bad things
5777	 * happen if we iget() an unused inode, as the subsequent iput()
5778	 * will try to delete it.
5779	 */
5780	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
5781	if (IS_ERR(journal_inode)) {
5782		ext4_msg(sb, KERN_ERR, "no journal found");
5783		return ERR_CAST(journal_inode);
5784	}
5785	if (!journal_inode->i_nlink) {
5786		make_bad_inode(journal_inode);
5787		iput(journal_inode);
5788		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
5789		return ERR_PTR(-EFSCORRUPTED);
5790	}
5791	if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
5792		ext4_msg(sb, KERN_ERR, "invalid journal inode");
5793		iput(journal_inode);
5794		return ERR_PTR(-EFSCORRUPTED);
5795	}
5796
5797	ext4_debug("Journal inode found at %p: %lld bytes\n",
5798		  journal_inode, journal_inode->i_size);
5799	return journal_inode;
5800}
5801
5802static int ext4_journal_bmap(journal_t *journal, sector_t *block)
5803{
5804	struct ext4_map_blocks map;
5805	int ret;
5806
5807	if (journal->j_inode == NULL)
5808		return 0;
5809
5810	map.m_lblk = *block;
5811	map.m_len = 1;
5812	ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
5813	if (ret <= 0) {
5814		ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
5815			 "journal bmap failed: block %llu ret %d\n",
5816			 *block, ret);
5817		jbd2_journal_abort(journal, ret ? ret : -EIO);
5818		return ret;
5819	}
5820	*block = map.m_pblk;
5821	return 0;
5822}
5823
5824static journal_t *ext4_open_inode_journal(struct super_block *sb,
5825					  unsigned int journal_inum)
5826{
5827	struct inode *journal_inode;
5828	journal_t *journal;
5829
5830	journal_inode = ext4_get_journal_inode(sb, journal_inum);
5831	if (IS_ERR(journal_inode))
5832		return ERR_CAST(journal_inode);
5833
5834	journal = jbd2_journal_init_inode(journal_inode);
5835	if (IS_ERR(journal)) {
5836		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
5837		iput(journal_inode);
5838		return ERR_CAST(journal);
5839	}
5840	journal->j_private = sb;
5841	journal->j_bmap = ext4_journal_bmap;
5842	ext4_init_journal_params(sb, journal);
5843	return journal;
5844}
5845
5846static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
5847					dev_t j_dev, ext4_fsblk_t *j_start,
5848					ext4_fsblk_t *j_len)
5849{
5850	struct buffer_head *bh;
5851	struct block_device *bdev;
5852	int hblock, blocksize;
5853	ext4_fsblk_t sb_block;
5854	unsigned long offset;
5855	struct ext4_super_block *es;
5856	int errno;
5857
5858	/* see get_tree_bdev why this is needed and safe */
5859	up_write(&sb->s_umount);
5860	bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
5861				 &fs_holder_ops);
5862	down_write(&sb->s_umount);
5863	if (IS_ERR(bdev)) {
5864		ext4_msg(sb, KERN_ERR,
5865			 "failed to open journal device unknown-block(%u,%u) %ld",
5866			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
5867		return ERR_CAST(bdev);
5868	}
5869
5870	blocksize = sb->s_blocksize;
5871	hblock = bdev_logical_block_size(bdev);
5872	if (blocksize < hblock) {
5873		ext4_msg(sb, KERN_ERR,
5874			"blocksize too small for journal device");
5875		errno = -EINVAL;
5876		goto out_bdev;
5877	}
5878
5879	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
5880	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
5881	set_blocksize(bdev, blocksize);
5882	bh = __bread(bdev, sb_block, blocksize);
5883	if (!bh) {
5884		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
5885		       "external journal");
5886		errno = -EINVAL;
5887		goto out_bdev;
5888	}
5889
5890	es = (struct ext4_super_block *) (bh->b_data + offset);
5891	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
5892	    !(le32_to_cpu(es->s_feature_incompat) &
5893	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
5894		ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
5895		errno = -EFSCORRUPTED;
5896		goto out_bh;
5897	}
5898
5899	if ((le32_to_cpu(es->s_feature_ro_compat) &
5900	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
5901	    es->s_checksum != ext4_superblock_csum(sb, es)) {
5902		ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
5903		errno = -EFSCORRUPTED;
5904		goto out_bh;
5905	}
5906
5907	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
5908		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
5909		errno = -EFSCORRUPTED;
5910		goto out_bh;
5911	}
5912
5913	*j_start = sb_block + 1;
5914	*j_len = ext4_blocks_count(es);
5915	brelse(bh);
5916	return bdev;
5917
5918out_bh:
5919	brelse(bh);
5920out_bdev:
5921	blkdev_put(bdev, sb);
5922	return ERR_PTR(errno);
5923}
5924
5925static journal_t *ext4_open_dev_journal(struct super_block *sb,
5926					dev_t j_dev)
5927{
5928	journal_t *journal;
5929	ext4_fsblk_t j_start;
5930	ext4_fsblk_t j_len;
5931	struct block_device *journal_bdev;
5932	int errno = 0;
5933
5934	journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
5935	if (IS_ERR(journal_bdev))
5936		return ERR_CAST(journal_bdev);
5937
5938	journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
5939					j_len, sb->s_blocksize);
5940	if (IS_ERR(journal)) {
5941		ext4_msg(sb, KERN_ERR, "failed to create device journal");
5942		errno = PTR_ERR(journal);
5943		goto out_bdev;
5944	}
5945	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
5946		ext4_msg(sb, KERN_ERR, "External journal has more than one "
5947					"user (unsupported) - %d",
5948			be32_to_cpu(journal->j_superblock->s_nr_users));
5949		errno = -EINVAL;
5950		goto out_journal;
5951	}
5952	journal->j_private = sb;
5953	EXT4_SB(sb)->s_journal_bdev = journal_bdev;
5954	ext4_init_journal_params(sb, journal);
5955	return journal;
5956
5957out_journal:
5958	jbd2_journal_destroy(journal);
5959out_bdev:
5960	blkdev_put(journal_bdev, sb);
5961	return ERR_PTR(errno);
5962}
5963
5964static int ext4_load_journal(struct super_block *sb,
5965			     struct ext4_super_block *es,
5966			     unsigned long journal_devnum)
5967{
5968	journal_t *journal;
5969	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
5970	dev_t journal_dev;
5971	int err = 0;
5972	int really_read_only;
5973	int journal_dev_ro;
5974
5975	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5976		return -EFSCORRUPTED;
5977
5978	if (journal_devnum &&
5979	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5980		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
5981			"numbers have changed");
5982		journal_dev = new_decode_dev(journal_devnum);
5983	} else
5984		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
5985
5986	if (journal_inum && journal_dev) {
5987		ext4_msg(sb, KERN_ERR,
5988			 "filesystem has both journal inode and journal device!");
5989		return -EINVAL;
5990	}
5991
5992	if (journal_inum) {
5993		journal = ext4_open_inode_journal(sb, journal_inum);
5994		if (IS_ERR(journal))
5995			return PTR_ERR(journal);
5996	} else {
5997		journal = ext4_open_dev_journal(sb, journal_dev);
5998		if (IS_ERR(journal))
5999			return PTR_ERR(journal);
6000	}
6001
6002	journal_dev_ro = bdev_read_only(journal->j_dev);
6003	really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
6004
6005	if (journal_dev_ro && !sb_rdonly(sb)) {
6006		ext4_msg(sb, KERN_ERR,
6007			 "journal device read-only, try mounting with '-o ro'");
6008		err = -EROFS;
6009		goto err_out;
6010	}
6011
6012	/*
6013	 * Are we loading a blank journal or performing recovery after a
6014	 * crash?  For recovery, we need to check in advance whether we
6015	 * can get read-write access to the device.
6016	 */
6017	if (ext4_has_feature_journal_needs_recovery(sb)) {
6018		if (sb_rdonly(sb)) {
6019			ext4_msg(sb, KERN_INFO, "INFO: recovery "
6020					"required on readonly filesystem");
6021			if (really_read_only) {
6022				ext4_msg(sb, KERN_ERR, "write access "
6023					"unavailable, cannot proceed "
6024					"(try mounting with noload)");
6025				err = -EROFS;
6026				goto err_out;
6027			}
6028			ext4_msg(sb, KERN_INFO, "write access will "
6029			       "be enabled during recovery");
6030		}
6031	}
6032
6033	if (!(journal->j_flags & JBD2_BARRIER))
6034		ext4_msg(sb, KERN_INFO, "barriers disabled");
6035
6036	if (!ext4_has_feature_journal_needs_recovery(sb))
6037		err = jbd2_journal_wipe(journal, !really_read_only);
6038	if (!err) {
6039		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
6040		__le16 orig_state;
6041		bool changed = false;
6042
6043		if (save)
6044			memcpy(save, ((char *) es) +
6045			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
6046		err = jbd2_journal_load(journal);
6047		if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
6048				   save, EXT4_S_ERR_LEN)) {
6049			memcpy(((char *) es) + EXT4_S_ERR_START,
6050			       save, EXT4_S_ERR_LEN);
6051			changed = true;
6052		}
6053		kfree(save);
6054		orig_state = es->s_state;
6055		es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
6056					   EXT4_ERROR_FS);
6057		if (orig_state != es->s_state)
6058			changed = true;
6059		/* Write out restored error information to the superblock */
6060		if (changed && !really_read_only) {
6061			int err2;
6062			err2 = ext4_commit_super(sb);
6063			err = err ? : err2;
6064		}
6065	}
6066
6067	if (err) {
6068		ext4_msg(sb, KERN_ERR, "error loading journal");
6069		goto err_out;
6070	}
6071
6072	EXT4_SB(sb)->s_journal = journal;
6073	err = ext4_clear_journal_err(sb, es);
6074	if (err) {
6075		EXT4_SB(sb)->s_journal = NULL;
6076		jbd2_journal_destroy(journal);
6077		return err;
6078	}
6079
6080	if (!really_read_only && journal_devnum &&
6081	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
6082		es->s_journal_dev = cpu_to_le32(journal_devnum);
6083		ext4_commit_super(sb);
6084	}
6085	if (!really_read_only && journal_inum &&
6086	    journal_inum != le32_to_cpu(es->s_journal_inum)) {
6087		es->s_journal_inum = cpu_to_le32(journal_inum);
6088		ext4_commit_super(sb);
6089	}
6090
6091	return 0;
6092
6093err_out:
6094	jbd2_journal_destroy(journal);
6095	return err;
6096}
6097
6098/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
6099static void ext4_update_super(struct super_block *sb)
6100{
6101	struct ext4_sb_info *sbi = EXT4_SB(sb);
6102	struct ext4_super_block *es = sbi->s_es;
6103	struct buffer_head *sbh = sbi->s_sbh;
6104
6105	lock_buffer(sbh);
6106	/*
6107	 * If the file system is mounted read-only, don't update the
6108	 * superblock write time.  This avoids updating the superblock
6109	 * write time when we are mounting the root file system
6110	 * read/only but we need to replay the journal; at that point,
6111	 * for people who are east of GMT and who make their clock
6112	 * tick in localtime for Windows bug-for-bug compatibility,
6113	 * the clock is set in the future, and this will cause e2fsck
6114	 * to complain and force a full file system check.
6115	 */
6116	if (!sb_rdonly(sb))
6117		ext4_update_tstamp(es, s_wtime);
6118	es->s_kbytes_written =
6119		cpu_to_le64(sbi->s_kbytes_written +
6120		    ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
6121		      sbi->s_sectors_written_start) >> 1));
6122	if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
6123		ext4_free_blocks_count_set(es,
6124			EXT4_C2B(sbi, percpu_counter_sum_positive(
6125				&sbi->s_freeclusters_counter)));
6126	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
6127		es->s_free_inodes_count =
6128			cpu_to_le32(percpu_counter_sum_positive(
6129				&sbi->s_freeinodes_counter));
6130	/* Copy error information to the on-disk superblock */
6131	spin_lock(&sbi->s_error_lock);
6132	if (sbi->s_add_error_count > 0) {
6133		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6134		if (!es->s_first_error_time && !es->s_first_error_time_hi) {
6135			__ext4_update_tstamp(&es->s_first_error_time,
6136					     &es->s_first_error_time_hi,
6137					     sbi->s_first_error_time);
6138			strncpy(es->s_first_error_func, sbi->s_first_error_func,
6139				sizeof(es->s_first_error_func));
6140			es->s_first_error_line =
6141				cpu_to_le32(sbi->s_first_error_line);
6142			es->s_first_error_ino =
6143				cpu_to_le32(sbi->s_first_error_ino);
6144			es->s_first_error_block =
6145				cpu_to_le64(sbi->s_first_error_block);
6146			es->s_first_error_errcode =
6147				ext4_errno_to_code(sbi->s_first_error_code);
6148		}
6149		__ext4_update_tstamp(&es->s_last_error_time,
6150				     &es->s_last_error_time_hi,
6151				     sbi->s_last_error_time);
6152		strncpy(es->s_last_error_func, sbi->s_last_error_func,
6153			sizeof(es->s_last_error_func));
6154		es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
6155		es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
6156		es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
6157		es->s_last_error_errcode =
6158				ext4_errno_to_code(sbi->s_last_error_code);
6159		/*
6160		 * Start the daily error reporting function if it hasn't been
6161		 * started already
6162		 */
6163		if (!es->s_error_count)
6164			mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
6165		le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
6166		sbi->s_add_error_count = 0;
6167	}
6168	spin_unlock(&sbi->s_error_lock);
6169
6170	ext4_superblock_csum_set(sb);
6171	unlock_buffer(sbh);
6172}
6173
6174static int ext4_commit_super(struct super_block *sb)
6175{
6176	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
6177
6178	if (!sbh)
6179		return -EINVAL;
6180	if (block_device_ejected(sb))
6181		return -ENODEV;
6182
6183	ext4_update_super(sb);
6184
6185	lock_buffer(sbh);
6186	/* Buffer got discarded which means block device got invalidated */
6187	if (!buffer_mapped(sbh)) {
6188		unlock_buffer(sbh);
6189		return -EIO;
6190	}
6191
6192	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
6193		/*
6194		 * Oh, dear.  A previous attempt to write the
6195		 * superblock failed.  This could happen because the
6196		 * USB device was yanked out.  Or it could happen to
6197		 * be a transient write error and maybe the block will
6198		 * be remapped.  Nothing we can do but to retry the
6199		 * write and hope for the best.
6200		 */
6201		ext4_msg(sb, KERN_ERR, "previous I/O error to "
6202		       "superblock detected");
6203		clear_buffer_write_io_error(sbh);
6204		set_buffer_uptodate(sbh);
6205	}
6206	get_bh(sbh);
6207	/* Clear potential dirty bit if it was journalled update */
6208	clear_buffer_dirty(sbh);
6209	sbh->b_end_io = end_buffer_write_sync;
6210	submit_bh(REQ_OP_WRITE | REQ_SYNC |
6211		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
6212	wait_on_buffer(sbh);
6213	if (buffer_write_io_error(sbh)) {
6214		ext4_msg(sb, KERN_ERR, "I/O error while writing "
6215		       "superblock");
6216		clear_buffer_write_io_error(sbh);
6217		set_buffer_uptodate(sbh);
6218		return -EIO;
6219	}
6220	return 0;
6221}
6222
6223/*
6224 * Have we just finished recovery?  If so, and if we are mounting (or
6225 * remounting) the filesystem readonly, then we will end up with a
6226 * consistent fs on disk.  Record that fact.
6227 */
6228static int ext4_mark_recovery_complete(struct super_block *sb,
6229				       struct ext4_super_block *es)
6230{
6231	int err;
6232	journal_t *journal = EXT4_SB(sb)->s_journal;
6233
6234	if (!ext4_has_feature_journal(sb)) {
6235		if (journal != NULL) {
6236			ext4_error(sb, "Journal got removed while the fs was "
6237				   "mounted!");
6238			return -EFSCORRUPTED;
6239		}
6240		return 0;
6241	}
6242	jbd2_journal_lock_updates(journal);
6243	err = jbd2_journal_flush(journal, 0);
6244	if (err < 0)
6245		goto out;
6246
6247	if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
6248	    ext4_has_feature_orphan_present(sb))) {
6249		if (!ext4_orphan_file_empty(sb)) {
6250			ext4_error(sb, "Orphan file not empty on read-only fs.");
6251			err = -EFSCORRUPTED;
6252			goto out;
6253		}
6254		ext4_clear_feature_journal_needs_recovery(sb);
6255		ext4_clear_feature_orphan_present(sb);
6256		ext4_commit_super(sb);
6257	}
6258out:
6259	jbd2_journal_unlock_updates(journal);
6260	return err;
6261}
6262
6263/*
6264 * If we are mounting (or read-write remounting) a filesystem whose journal
6265 * has recorded an error from a previous lifetime, move that error to the
6266 * main filesystem now.
6267 */
6268static int ext4_clear_journal_err(struct super_block *sb,
6269				   struct ext4_super_block *es)
6270{
6271	journal_t *journal;
6272	int j_errno;
6273	const char *errstr;
6274
6275	if (!ext4_has_feature_journal(sb)) {
6276		ext4_error(sb, "Journal got removed while the fs was mounted!");
6277		return -EFSCORRUPTED;
6278	}
6279
6280	journal = EXT4_SB(sb)->s_journal;
6281
6282	/*
6283	 * Now check for any error status which may have been recorded in the
6284	 * journal by a prior ext4_error() or ext4_abort()
6285	 */
6286
6287	j_errno = jbd2_journal_errno(journal);
6288	if (j_errno) {
6289		char nbuf[16];
6290
6291		errstr = ext4_decode_error(sb, j_errno, nbuf);
6292		ext4_warning(sb, "Filesystem error recorded "
6293			     "from previous mount: %s", errstr);
6294
6295		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
6296		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6297		j_errno = ext4_commit_super(sb);
6298		if (j_errno)
6299			return j_errno;
6300		ext4_warning(sb, "Marked fs in need of filesystem check.");
6301
6302		jbd2_journal_clear_err(journal);
6303		jbd2_journal_update_sb_errno(journal);
6304	}
6305	return 0;
6306}
6307
6308/*
6309 * Force the running and committing transactions to commit,
6310 * and wait on the commit.
6311 */
6312int ext4_force_commit(struct super_block *sb)
6313{
6314	return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
6315}
6316
6317static int ext4_sync_fs(struct super_block *sb, int wait)
6318{
6319	int ret = 0;
6320	tid_t target;
6321	bool needs_barrier = false;
6322	struct ext4_sb_info *sbi = EXT4_SB(sb);
6323
6324	if (unlikely(ext4_forced_shutdown(sb)))
6325		return 0;
6326
6327	trace_ext4_sync_fs(sb, wait);
6328	flush_workqueue(sbi->rsv_conversion_wq);
6329	/*
6330	 * Writeback quota in non-journalled quota case - journalled quota has
6331	 * no dirty dquots
6332	 */
6333	dquot_writeback_dquots(sb, -1);
6334	/*
6335	 * Data writeback is possible w/o journal transaction, so barrier must
6336	 * being sent at the end of the function. But we can skip it if
6337	 * transaction_commit will do it for us.
6338	 */
6339	if (sbi->s_journal) {
6340		target = jbd2_get_latest_transaction(sbi->s_journal);
6341		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
6342		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
6343			needs_barrier = true;
6344
6345		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
6346			if (wait)
6347				ret = jbd2_log_wait_commit(sbi->s_journal,
6348							   target);
6349		}
6350	} else if (wait && test_opt(sb, BARRIER))
6351		needs_barrier = true;
6352	if (needs_barrier) {
6353		int err;
6354		err = blkdev_issue_flush(sb->s_bdev);
6355		if (!ret)
6356			ret = err;
6357	}
6358
6359	return ret;
6360}
6361
6362/*
6363 * LVM calls this function before a (read-only) snapshot is created.  This
6364 * gives us a chance to flush the journal completely and mark the fs clean.
6365 *
6366 * Note that only this function cannot bring a filesystem to be in a clean
6367 * state independently. It relies on upper layer to stop all data & metadata
6368 * modifications.
6369 */
6370static int ext4_freeze(struct super_block *sb)
6371{
6372	int error = 0;
6373	journal_t *journal = EXT4_SB(sb)->s_journal;
6374
6375	if (journal) {
6376		/* Now we set up the journal barrier. */
6377		jbd2_journal_lock_updates(journal);
6378
6379		/*
6380		 * Don't clear the needs_recovery flag if we failed to
6381		 * flush the journal.
6382		 */
6383		error = jbd2_journal_flush(journal, 0);
6384		if (error < 0)
6385			goto out;
6386
6387		/* Journal blocked and flushed, clear needs_recovery flag. */
6388		ext4_clear_feature_journal_needs_recovery(sb);
6389		if (ext4_orphan_file_empty(sb))
6390			ext4_clear_feature_orphan_present(sb);
6391	}
6392
6393	error = ext4_commit_super(sb);
6394out:
6395	if (journal)
6396		/* we rely on upper layer to stop further updates */
6397		jbd2_journal_unlock_updates(journal);
6398	return error;
6399}
6400
6401/*
6402 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
6403 * flag here, even though the filesystem is not technically dirty yet.
6404 */
6405static int ext4_unfreeze(struct super_block *sb)
6406{
6407	if (ext4_forced_shutdown(sb))
6408		return 0;
6409
6410	if (EXT4_SB(sb)->s_journal) {
6411		/* Reset the needs_recovery flag before the fs is unlocked. */
6412		ext4_set_feature_journal_needs_recovery(sb);
6413		if (ext4_has_feature_orphan_file(sb))
6414			ext4_set_feature_orphan_present(sb);
6415	}
6416
6417	ext4_commit_super(sb);
6418	return 0;
6419}
6420
6421/*
6422 * Structure to save mount options for ext4_remount's benefit
6423 */
6424struct ext4_mount_options {
6425	unsigned long s_mount_opt;
6426	unsigned long s_mount_opt2;
6427	kuid_t s_resuid;
6428	kgid_t s_resgid;
6429	unsigned long s_commit_interval;
6430	u32 s_min_batch_time, s_max_batch_time;
6431#ifdef CONFIG_QUOTA
6432	int s_jquota_fmt;
6433	char *s_qf_names[EXT4_MAXQUOTAS];
6434#endif
6435};
6436
6437static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
6438{
6439	struct ext4_fs_context *ctx = fc->fs_private;
6440	struct ext4_super_block *es;
6441	struct ext4_sb_info *sbi = EXT4_SB(sb);
6442	unsigned long old_sb_flags;
6443	struct ext4_mount_options old_opts;
6444	ext4_group_t g;
6445	int err = 0;
6446	int alloc_ctx;
6447#ifdef CONFIG_QUOTA
6448	int enable_quota = 0;
6449	int i, j;
6450	char *to_free[EXT4_MAXQUOTAS];
6451#endif
6452
6453
6454	/* Store the original options */
6455	old_sb_flags = sb->s_flags;
6456	old_opts.s_mount_opt = sbi->s_mount_opt;
6457	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
6458	old_opts.s_resuid = sbi->s_resuid;
6459	old_opts.s_resgid = sbi->s_resgid;
6460	old_opts.s_commit_interval = sbi->s_commit_interval;
6461	old_opts.s_min_batch_time = sbi->s_min_batch_time;
6462	old_opts.s_max_batch_time = sbi->s_max_batch_time;
6463#ifdef CONFIG_QUOTA
6464	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
6465	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6466		if (sbi->s_qf_names[i]) {
6467			char *qf_name = get_qf_name(sb, sbi, i);
6468
6469			old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
6470			if (!old_opts.s_qf_names[i]) {
6471				for (j = 0; j < i; j++)
6472					kfree(old_opts.s_qf_names[j]);
6473				return -ENOMEM;
6474			}
6475		} else
6476			old_opts.s_qf_names[i] = NULL;
6477#endif
6478	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
6479		if (sbi->s_journal && sbi->s_journal->j_task->io_context)
6480			ctx->journal_ioprio =
6481				sbi->s_journal->j_task->io_context->ioprio;
6482		else
6483			ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
6484
6485	}
6486
6487	/*
6488	 * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
6489	 * two calls to ext4_should_dioread_nolock() to return inconsistent
6490	 * values, triggering WARN_ON in ext4_add_complete_io(). we grab
6491	 * here s_writepages_rwsem to avoid race between writepages ops and
6492	 * remount.
6493	 */
6494	alloc_ctx = ext4_writepages_down_write(sb);
6495	ext4_apply_options(fc, sb);
6496	ext4_writepages_up_write(sb, alloc_ctx);
6497
6498	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
6499	    test_opt(sb, JOURNAL_CHECKSUM)) {
6500		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
6501			 "during remount not supported; ignoring");
6502		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
6503	}
6504
6505	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
6506		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
6507			ext4_msg(sb, KERN_ERR, "can't mount with "
6508				 "both data=journal and delalloc");
6509			err = -EINVAL;
6510			goto restore_opts;
6511		}
6512		if (test_opt(sb, DIOREAD_NOLOCK)) {
6513			ext4_msg(sb, KERN_ERR, "can't mount with "
6514				 "both data=journal and dioread_nolock");
6515			err = -EINVAL;
6516			goto restore_opts;
6517		}
6518	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
6519		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
6520			ext4_msg(sb, KERN_ERR, "can't mount with "
6521				"journal_async_commit in data=ordered mode");
6522			err = -EINVAL;
6523			goto restore_opts;
6524		}
6525	}
6526
6527	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
6528		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
6529		err = -EINVAL;
6530		goto restore_opts;
6531	}
6532
6533	if (test_opt2(sb, ABORT))
6534		ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
6535
6536	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
6537		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
6538
6539	es = sbi->s_es;
6540
6541	if (sbi->s_journal) {
6542		ext4_init_journal_params(sb, sbi->s_journal);
6543		set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
6544	}
6545
6546	/* Flush outstanding errors before changing fs state */
6547	flush_work(&sbi->s_sb_upd_work);
6548
6549	if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
6550		if (ext4_forced_shutdown(sb)) {
6551			err = -EROFS;
6552			goto restore_opts;
6553		}
6554
6555		if (fc->sb_flags & SB_RDONLY) {
6556			err = sync_filesystem(sb);
6557			if (err < 0)
6558				goto restore_opts;
6559			err = dquot_suspend(sb, -1);
6560			if (err < 0)
6561				goto restore_opts;
6562
6563			/*
6564			 * First of all, the unconditional stuff we have to do
6565			 * to disable replay of the journal when we next remount
6566			 */
6567			sb->s_flags |= SB_RDONLY;
6568
6569			/*
6570			 * OK, test if we are remounting a valid rw partition
6571			 * readonly, and if so set the rdonly flag and then
6572			 * mark the partition as valid again.
6573			 */
6574			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
6575			    (sbi->s_mount_state & EXT4_VALID_FS))
6576				es->s_state = cpu_to_le16(sbi->s_mount_state);
6577
6578			if (sbi->s_journal) {
6579				/*
6580				 * We let remount-ro finish even if marking fs
6581				 * as clean failed...
6582				 */
6583				ext4_mark_recovery_complete(sb, es);
6584			}
6585		} else {
6586			/* Make sure we can mount this feature set readwrite */
6587			if (ext4_has_feature_readonly(sb) ||
6588			    !ext4_feature_set_ok(sb, 0)) {
6589				err = -EROFS;
6590				goto restore_opts;
6591			}
6592			/*
6593			 * Make sure the group descriptor checksums
6594			 * are sane.  If they aren't, refuse to remount r/w.
6595			 */
6596			for (g = 0; g < sbi->s_groups_count; g++) {
6597				struct ext4_group_desc *gdp =
6598					ext4_get_group_desc(sb, g, NULL);
6599
6600				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
6601					ext4_msg(sb, KERN_ERR,
6602	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
6603		g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
6604					       le16_to_cpu(gdp->bg_checksum));
6605					err = -EFSBADCRC;
6606					goto restore_opts;
6607				}
6608			}
6609
6610			/*
6611			 * If we have an unprocessed orphan list hanging
6612			 * around from a previously readonly bdev mount,
6613			 * require a full umount/remount for now.
6614			 */
6615			if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
6616				ext4_msg(sb, KERN_WARNING, "Couldn't "
6617				       "remount RDWR because of unprocessed "
6618				       "orphan inode list.  Please "
6619				       "umount/remount instead");
6620				err = -EINVAL;
6621				goto restore_opts;
6622			}
6623
6624			/*
6625			 * Mounting a RDONLY partition read-write, so reread
6626			 * and store the current valid flag.  (It may have
6627			 * been changed by e2fsck since we originally mounted
6628			 * the partition.)
6629			 */
6630			if (sbi->s_journal) {
6631				err = ext4_clear_journal_err(sb, es);
6632				if (err)
6633					goto restore_opts;
6634			}
6635			sbi->s_mount_state = (le16_to_cpu(es->s_state) &
6636					      ~EXT4_FC_REPLAY);
6637
6638			err = ext4_setup_super(sb, es, 0);
6639			if (err)
6640				goto restore_opts;
6641
6642			sb->s_flags &= ~SB_RDONLY;
6643			if (ext4_has_feature_mmp(sb)) {
6644				err = ext4_multi_mount_protect(sb,
6645						le64_to_cpu(es->s_mmp_block));
6646				if (err)
6647					goto restore_opts;
6648			}
6649#ifdef CONFIG_QUOTA
6650			enable_quota = 1;
6651#endif
6652		}
6653	}
6654
6655	/*
6656	 * Handle creation of system zone data early because it can fail.
6657	 * Releasing of existing data is done when we are sure remount will
6658	 * succeed.
6659	 */
6660	if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
6661		err = ext4_setup_system_zone(sb);
6662		if (err)
6663			goto restore_opts;
6664	}
6665
6666	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
6667		err = ext4_commit_super(sb);
6668		if (err)
6669			goto restore_opts;
6670	}
6671
6672#ifdef CONFIG_QUOTA
6673	if (enable_quota) {
6674		if (sb_any_quota_suspended(sb))
6675			dquot_resume(sb, -1);
6676		else if (ext4_has_feature_quota(sb)) {
6677			err = ext4_enable_quotas(sb);
6678			if (err)
6679				goto restore_opts;
6680		}
6681	}
6682	/* Release old quota file names */
6683	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6684		kfree(old_opts.s_qf_names[i]);
6685#endif
6686	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6687		ext4_release_system_zone(sb);
6688
6689	/*
6690	 * Reinitialize lazy itable initialization thread based on
6691	 * current settings
6692	 */
6693	if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
6694		ext4_unregister_li_request(sb);
6695	else {
6696		ext4_group_t first_not_zeroed;
6697		first_not_zeroed = ext4_has_uninit_itable(sb);
6698		ext4_register_li_request(sb, first_not_zeroed);
6699	}
6700
6701	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6702		ext4_stop_mmpd(sbi);
6703
6704	return 0;
6705
6706restore_opts:
6707	/*
6708	 * If there was a failing r/w to ro transition, we may need to
6709	 * re-enable quota
6710	 */
6711	if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
6712	    sb_any_quota_suspended(sb))
6713		dquot_resume(sb, -1);
6714
6715	alloc_ctx = ext4_writepages_down_write(sb);
6716	sb->s_flags = old_sb_flags;
6717	sbi->s_mount_opt = old_opts.s_mount_opt;
6718	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
6719	sbi->s_resuid = old_opts.s_resuid;
6720	sbi->s_resgid = old_opts.s_resgid;
6721	sbi->s_commit_interval = old_opts.s_commit_interval;
6722	sbi->s_min_batch_time = old_opts.s_min_batch_time;
6723	sbi->s_max_batch_time = old_opts.s_max_batch_time;
6724	ext4_writepages_up_write(sb, alloc_ctx);
6725
6726	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6727		ext4_release_system_zone(sb);
6728#ifdef CONFIG_QUOTA
6729	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
6730	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
6731		to_free[i] = get_qf_name(sb, sbi, i);
6732		rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
6733	}
6734	synchronize_rcu();
6735	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6736		kfree(to_free[i]);
6737#endif
6738	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6739		ext4_stop_mmpd(sbi);
6740	return err;
6741}
6742
6743static int ext4_reconfigure(struct fs_context *fc)
6744{
6745	struct super_block *sb = fc->root->d_sb;
6746	int ret;
6747
6748	fc->s_fs_info = EXT4_SB(sb);
6749
6750	ret = ext4_check_opt_consistency(fc, sb);
6751	if (ret < 0)
6752		return ret;
6753
6754	ret = __ext4_remount(fc, sb);
6755	if (ret < 0)
6756		return ret;
6757
6758	ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
6759		 &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
6760		 ext4_quota_mode(sb));
6761
6762	return 0;
6763}
6764
6765#ifdef CONFIG_QUOTA
6766static int ext4_statfs_project(struct super_block *sb,
6767			       kprojid_t projid, struct kstatfs *buf)
6768{
6769	struct kqid qid;
6770	struct dquot *dquot;
6771	u64 limit;
6772	u64 curblock;
6773
6774	qid = make_kqid_projid(projid);
6775	dquot = dqget(sb, qid);
6776	if (IS_ERR(dquot))
6777		return PTR_ERR(dquot);
6778	spin_lock(&dquot->dq_dqb_lock);
6779
6780	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
6781			     dquot->dq_dqb.dqb_bhardlimit);
6782	limit >>= sb->s_blocksize_bits;
6783
6784	if (limit && buf->f_blocks > limit) {
6785		curblock = (dquot->dq_dqb.dqb_curspace +
6786			    dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
6787		buf->f_blocks = limit;
6788		buf->f_bfree = buf->f_bavail =
6789			(buf->f_blocks > curblock) ?
6790			 (buf->f_blocks - curblock) : 0;
6791	}
6792
6793	limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
6794			     dquot->dq_dqb.dqb_ihardlimit);
6795	if (limit && buf->f_files > limit) {
6796		buf->f_files = limit;
6797		buf->f_ffree =
6798			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
6799			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
6800	}
6801
6802	spin_unlock(&dquot->dq_dqb_lock);
6803	dqput(dquot);
6804	return 0;
6805}
6806#endif
6807
6808static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
6809{
6810	struct super_block *sb = dentry->d_sb;
6811	struct ext4_sb_info *sbi = EXT4_SB(sb);
6812	struct ext4_super_block *es = sbi->s_es;
6813	ext4_fsblk_t overhead = 0, resv_blocks;
6814	s64 bfree;
6815	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
6816
6817	if (!test_opt(sb, MINIX_DF))
6818		overhead = sbi->s_overhead;
6819
6820	buf->f_type = EXT4_SUPER_MAGIC;
6821	buf->f_bsize = sb->s_blocksize;
6822	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
6823	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
6824		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
6825	/* prevent underflow in case that few free space is available */
6826	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
6827	buf->f_bavail = buf->f_bfree -
6828			(ext4_r_blocks_count(es) + resv_blocks);
6829	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
6830		buf->f_bavail = 0;
6831	buf->f_files = le32_to_cpu(es->s_inodes_count);
6832	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
6833	buf->f_namelen = EXT4_NAME_LEN;
6834	buf->f_fsid = uuid_to_fsid(es->s_uuid);
6835
6836#ifdef CONFIG_QUOTA
6837	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
6838	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
6839		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
6840#endif
6841	return 0;
6842}
6843
6844
6845#ifdef CONFIG_QUOTA
6846
6847/*
6848 * Helper functions so that transaction is started before we acquire dqio_sem
6849 * to keep correct lock ordering of transaction > dqio_sem
6850 */
6851static inline struct inode *dquot_to_inode(struct dquot *dquot)
6852{
6853	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
6854}
6855
6856static int ext4_write_dquot(struct dquot *dquot)
6857{
6858	int ret, err;
6859	handle_t *handle;
6860	struct inode *inode;
6861
6862	inode = dquot_to_inode(dquot);
6863	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
6864				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
6865	if (IS_ERR(handle))
6866		return PTR_ERR(handle);
6867	ret = dquot_commit(dquot);
6868	err = ext4_journal_stop(handle);
6869	if (!ret)
6870		ret = err;
6871	return ret;
6872}
6873
6874static int ext4_acquire_dquot(struct dquot *dquot)
6875{
6876	int ret, err;
6877	handle_t *handle;
6878
6879	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6880				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
6881	if (IS_ERR(handle))
6882		return PTR_ERR(handle);
6883	ret = dquot_acquire(dquot);
6884	err = ext4_journal_stop(handle);
6885	if (!ret)
6886		ret = err;
6887	return ret;
6888}
6889
6890static int ext4_release_dquot(struct dquot *dquot)
6891{
6892	int ret, err;
6893	handle_t *handle;
6894
6895	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6896				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
6897	if (IS_ERR(handle)) {
6898		/* Release dquot anyway to avoid endless cycle in dqput() */
6899		dquot_release(dquot);
6900		return PTR_ERR(handle);
6901	}
6902	ret = dquot_release(dquot);
6903	err = ext4_journal_stop(handle);
6904	if (!ret)
6905		ret = err;
6906	return ret;
6907}
6908
6909static int ext4_mark_dquot_dirty(struct dquot *dquot)
6910{
6911	struct super_block *sb = dquot->dq_sb;
6912
6913	if (ext4_is_quota_journalled(sb)) {
6914		dquot_mark_dquot_dirty(dquot);
6915		return ext4_write_dquot(dquot);
6916	} else {
6917		return dquot_mark_dquot_dirty(dquot);
6918	}
6919}
6920
6921static int ext4_write_info(struct super_block *sb, int type)
6922{
6923	int ret, err;
6924	handle_t *handle;
6925
6926	/* Data block + inode block */
6927	handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
6928	if (IS_ERR(handle))
6929		return PTR_ERR(handle);
6930	ret = dquot_commit_info(sb, type);
6931	err = ext4_journal_stop(handle);
6932	if (!ret)
6933		ret = err;
6934	return ret;
6935}
6936
6937static void lockdep_set_quota_inode(struct inode *inode, int subclass)
6938{
6939	struct ext4_inode_info *ei = EXT4_I(inode);
6940
6941	/* The first argument of lockdep_set_subclass has to be
6942	 * *exactly* the same as the argument to init_rwsem() --- in
6943	 * this case, in init_once() --- or lockdep gets unhappy
6944	 * because the name of the lock is set using the
6945	 * stringification of the argument to init_rwsem().
6946	 */
6947	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
6948	lockdep_set_subclass(&ei->i_data_sem, subclass);
6949}
6950
6951/*
6952 * Standard function to be called on quota_on
6953 */
6954static int ext4_quota_on(struct super_block *sb, int type, int format_id,
6955			 const struct path *path)
6956{
6957	int err;
6958
6959	if (!test_opt(sb, QUOTA))
6960		return -EINVAL;
6961
6962	/* Quotafile not on the same filesystem? */
6963	if (path->dentry->d_sb != sb)
6964		return -EXDEV;
6965
6966	/* Quota already enabled for this file? */
6967	if (IS_NOQUOTA(d_inode(path->dentry)))
6968		return -EBUSY;
6969
6970	/* Journaling quota? */
6971	if (EXT4_SB(sb)->s_qf_names[type]) {
6972		/* Quotafile not in fs root? */
6973		if (path->dentry->d_parent != sb->s_root)
6974			ext4_msg(sb, KERN_WARNING,
6975				"Quota file not on filesystem root. "
6976				"Journaled quota will not work");
6977		sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
6978	} else {
6979		/*
6980		 * Clear the flag just in case mount options changed since
6981		 * last time.
6982		 */
6983		sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
6984	}
6985
6986	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
6987	err = dquot_quota_on(sb, type, format_id, path);
6988	if (!err) {
6989		struct inode *inode = d_inode(path->dentry);
6990		handle_t *handle;
6991
6992		/*
6993		 * Set inode flags to prevent userspace from messing with quota
6994		 * files. If this fails, we return success anyway since quotas
6995		 * are already enabled and this is not a hard failure.
6996		 */
6997		inode_lock(inode);
6998		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6999		if (IS_ERR(handle))
7000			goto unlock_inode;
7001		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
7002		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
7003				S_NOATIME | S_IMMUTABLE);
7004		err = ext4_mark_inode_dirty(handle, inode);
7005		ext4_journal_stop(handle);
7006	unlock_inode:
7007		inode_unlock(inode);
7008		if (err)
7009			dquot_quota_off(sb, type);
7010	}
7011	if (err)
7012		lockdep_set_quota_inode(path->dentry->d_inode,
7013					     I_DATA_SEM_NORMAL);
7014	return err;
7015}
7016
7017static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
7018{
7019	switch (type) {
7020	case USRQUOTA:
7021		return qf_inum == EXT4_USR_QUOTA_INO;
7022	case GRPQUOTA:
7023		return qf_inum == EXT4_GRP_QUOTA_INO;
7024	case PRJQUOTA:
7025		return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
7026	default:
7027		BUG();
7028	}
7029}
7030
7031static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
7032			     unsigned int flags)
7033{
7034	int err;
7035	struct inode *qf_inode;
7036	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
7037		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
7038		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
7039		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
7040	};
7041
7042	BUG_ON(!ext4_has_feature_quota(sb));
7043
7044	if (!qf_inums[type])
7045		return -EPERM;
7046
7047	if (!ext4_check_quota_inum(type, qf_inums[type])) {
7048		ext4_error(sb, "Bad quota inum: %lu, type: %d",
7049				qf_inums[type], type);
7050		return -EUCLEAN;
7051	}
7052
7053	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
7054	if (IS_ERR(qf_inode)) {
7055		ext4_error(sb, "Bad quota inode: %lu, type: %d",
7056				qf_inums[type], type);
7057		return PTR_ERR(qf_inode);
7058	}
7059
7060	/* Don't account quota for quota files to avoid recursion */
7061	qf_inode->i_flags |= S_NOQUOTA;
7062	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
7063	err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
7064	if (err)
7065		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
7066	iput(qf_inode);
7067
7068	return err;
7069}
7070
7071/* Enable usage tracking for all quota types. */
7072int ext4_enable_quotas(struct super_block *sb)
7073{
7074	int type, err = 0;
7075	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
7076		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
7077		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
7078		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
7079	};
7080	bool quota_mopt[EXT4_MAXQUOTAS] = {
7081		test_opt(sb, USRQUOTA),
7082		test_opt(sb, GRPQUOTA),
7083		test_opt(sb, PRJQUOTA),
7084	};
7085
7086	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
7087	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
7088		if (qf_inums[type]) {
7089			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
7090				DQUOT_USAGE_ENABLED |
7091				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
7092			if (err) {
7093				ext4_warning(sb,
7094					"Failed to enable quota tracking "
7095					"(type=%d, err=%d, ino=%lu). "
7096					"Please run e2fsck to fix.", type,
7097					err, qf_inums[type]);
7098
7099				ext4_quotas_off(sb, type);
7100				return err;
7101			}
7102		}
7103	}
7104	return 0;
7105}
7106
7107static int ext4_quota_off(struct super_block *sb, int type)
7108{
7109	struct inode *inode = sb_dqopt(sb)->files[type];
7110	handle_t *handle;
7111	int err;
7112
7113	/* Force all delayed allocation blocks to be allocated.
7114	 * Caller already holds s_umount sem */
7115	if (test_opt(sb, DELALLOC))
7116		sync_filesystem(sb);
7117
7118	if (!inode || !igrab(inode))
7119		goto out;
7120
7121	err = dquot_quota_off(sb, type);
7122	if (err || ext4_has_feature_quota(sb))
7123		goto out_put;
7124	/*
7125	 * When the filesystem was remounted read-only first, we cannot cleanup
7126	 * inode flags here. Bad luck but people should be using QUOTA feature
7127	 * these days anyway.
7128	 */
7129	if (sb_rdonly(sb))
7130		goto out_put;
7131
7132	inode_lock(inode);
7133	/*
7134	 * Update modification times of quota files when userspace can
7135	 * start looking at them. If we fail, we return success anyway since
7136	 * this is not a hard failure and quotas are already disabled.
7137	 */
7138	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
7139	if (IS_ERR(handle)) {
7140		err = PTR_ERR(handle);
7141		goto out_unlock;
7142	}
7143	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
7144	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
7145	inode->i_mtime = inode_set_ctime_current(inode);
7146	err = ext4_mark_inode_dirty(handle, inode);
7147	ext4_journal_stop(handle);
7148out_unlock:
7149	inode_unlock(inode);
7150out_put:
7151	lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
7152	iput(inode);
7153	return err;
7154out:
7155	return dquot_quota_off(sb, type);
7156}
7157
7158/* Read data from quotafile - avoid pagecache and such because we cannot afford
7159 * acquiring the locks... As quota files are never truncated and quota code
7160 * itself serializes the operations (and no one else should touch the files)
7161 * we don't have to be afraid of races */
7162static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
7163			       size_t len, loff_t off)
7164{
7165	struct inode *inode = sb_dqopt(sb)->files[type];
7166	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7167	int offset = off & (sb->s_blocksize - 1);
7168	int tocopy;
7169	size_t toread;
7170	struct buffer_head *bh;
7171	loff_t i_size = i_size_read(inode);
7172
7173	if (off > i_size)
7174		return 0;
7175	if (off+len > i_size)
7176		len = i_size-off;
7177	toread = len;
7178	while (toread > 0) {
7179		tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
7180		bh = ext4_bread(NULL, inode, blk, 0);
7181		if (IS_ERR(bh))
7182			return PTR_ERR(bh);
7183		if (!bh)	/* A hole? */
7184			memset(data, 0, tocopy);
7185		else
7186			memcpy(data, bh->b_data+offset, tocopy);
7187		brelse(bh);
7188		offset = 0;
7189		toread -= tocopy;
7190		data += tocopy;
7191		blk++;
7192	}
7193	return len;
7194}
7195
7196/* Write to quotafile (we know the transaction is already started and has
7197 * enough credits) */
7198static ssize_t ext4_quota_write(struct super_block *sb, int type,
7199				const char *data, size_t len, loff_t off)
7200{
7201	struct inode *inode = sb_dqopt(sb)->files[type];
7202	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7203	int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
7204	int retries = 0;
7205	struct buffer_head *bh;
7206	handle_t *handle = journal_current_handle();
7207
7208	if (!handle) {
7209		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7210			" cancelled because transaction is not started",
7211			(unsigned long long)off, (unsigned long long)len);
7212		return -EIO;
7213	}
7214	/*
7215	 * Since we account only one data block in transaction credits,
7216	 * then it is impossible to cross a block boundary.
7217	 */
7218	if (sb->s_blocksize - offset < len) {
7219		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7220			" cancelled because not block aligned",
7221			(unsigned long long)off, (unsigned long long)len);
7222		return -EIO;
7223	}
7224
7225	do {
7226		bh = ext4_bread(handle, inode, blk,
7227				EXT4_GET_BLOCKS_CREATE |
7228				EXT4_GET_BLOCKS_METADATA_NOFAIL);
7229	} while (PTR_ERR(bh) == -ENOSPC &&
7230		 ext4_should_retry_alloc(inode->i_sb, &retries));
7231	if (IS_ERR(bh))
7232		return PTR_ERR(bh);
7233	if (!bh)
7234		goto out;
7235	BUFFER_TRACE(bh, "get write access");
7236	err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
7237	if (err) {
7238		brelse(bh);
7239		return err;
7240	}
7241	lock_buffer(bh);
7242	memcpy(bh->b_data+offset, data, len);
7243	flush_dcache_page(bh->b_page);
7244	unlock_buffer(bh);
7245	err = ext4_handle_dirty_metadata(handle, NULL, bh);
7246	brelse(bh);
7247out:
7248	if (inode->i_size < off + len) {
7249		i_size_write(inode, off + len);
7250		EXT4_I(inode)->i_disksize = inode->i_size;
7251		err2 = ext4_mark_inode_dirty(handle, inode);
7252		if (unlikely(err2 && !err))
7253			err = err2;
7254	}
7255	return err ? err : len;
7256}
7257#endif
7258
7259#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
7260static inline void register_as_ext2(void)
7261{
7262	int err = register_filesystem(&ext2_fs_type);
7263	if (err)
7264		printk(KERN_WARNING
7265		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
7266}
7267
7268static inline void unregister_as_ext2(void)
7269{
7270	unregister_filesystem(&ext2_fs_type);
7271}
7272
/*
 * Return 1 when @sb may be handled as ext2, 0 otherwise.  Unknown incompat
 * features always disqualify; unknown ro_compat features only matter when
 * the superblock is not read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext2_ro_compat_features(sb);
}
7283#else
/* Stubs for builds where ext4 does not stand in for ext2. */
static inline void register_as_ext2(void)
{
}

static inline void unregister_as_ext2(void)
{
}

static inline int ext2_feature_set_ok(struct super_block *sb)
{
	return 0;
}
7287#endif
7288
7289static inline void register_as_ext3(void)
7290{
7291	int err = register_filesystem(&ext3_fs_type);
7292	if (err)
7293		printk(KERN_WARNING
7294		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
7295}
7296
7297static inline void unregister_as_ext3(void)
7298{
7299	unregister_filesystem(&ext3_fs_type);
7300}
7301
/*
 * Return 1 when @sb may be handled as ext3, 0 otherwise.  Unknown incompat
 * features or a missing journal feature always disqualify; unknown
 * ro_compat features only matter when the superblock is not read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext3_ro_compat_features(sb);
}
7314
7315static void ext4_kill_sb(struct super_block *sb)
7316{
7317	struct ext4_sb_info *sbi = EXT4_SB(sb);
7318	struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
7319
7320	kill_block_super(sb);
7321
7322	if (journal_bdev)
7323		blkdev_put(journal_bdev, sb);
7324}
7325
7326static struct file_system_type ext4_fs_type = {
7327	.owner			= THIS_MODULE,
7328	.name			= "ext4",
7329	.init_fs_context	= ext4_init_fs_context,
7330	.parameters		= ext4_param_specs,
7331	.kill_sb		= ext4_kill_sb,
7332	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
7333};
7334MODULE_ALIAS_FS("ext4");
7335
7336/* Shared across all ext4 file systems */
7337wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
7338
7339static int __init ext4_init_fs(void)
7340{
7341	int i, err;
7342
7343	ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
7344	ext4_li_info = NULL;
7345
7346	/* Build-time check for flags consistency */
7347	ext4_check_flag_values();
7348
7349	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
7350		init_waitqueue_head(&ext4__ioend_wq[i]);
7351
7352	err = ext4_init_es();
7353	if (err)
7354		return err;
7355
7356	err = ext4_init_pending();
7357	if (err)
7358		goto out7;
7359
7360	err = ext4_init_post_read_processing();
7361	if (err)
7362		goto out6;
7363
7364	err = ext4_init_pageio();
7365	if (err)
7366		goto out5;
7367
7368	err = ext4_init_system_zone();
7369	if (err)
7370		goto out4;
7371
7372	err = ext4_init_sysfs();
7373	if (err)
7374		goto out3;
7375
7376	err = ext4_init_mballoc();
7377	if (err)
7378		goto out2;
7379	err = init_inodecache();
7380	if (err)
7381		goto out1;
7382
7383	err = ext4_fc_init_dentry_cache();
7384	if (err)
7385		goto out05;
7386
7387	register_as_ext3();
7388	register_as_ext2();
7389	err = register_filesystem(&ext4_fs_type);
7390	if (err)
7391		goto out;
7392
7393	return 0;
7394out:
7395	unregister_as_ext2();
7396	unregister_as_ext3();
7397	ext4_fc_destroy_dentry_cache();
7398out05:
7399	destroy_inodecache();
7400out1:
7401	ext4_exit_mballoc();
7402out2:
7403	ext4_exit_sysfs();
7404out3:
7405	ext4_exit_system_zone();
7406out4:
7407	ext4_exit_pageio();
7408out5:
7409	ext4_exit_post_read_processing();
7410out6:
7411	ext4_exit_pending();
7412out7:
7413	ext4_exit_es();
7414
7415	return err;
7416}
7417
7418static void __exit ext4_exit_fs(void)
7419{
7420	ext4_destroy_lazyinit_thread();
7421	unregister_as_ext2();
7422	unregister_as_ext3();
7423	unregister_filesystem(&ext4_fs_type);
7424	ext4_fc_destroy_dentry_cache();
7425	destroy_inodecache();
7426	ext4_exit_mballoc();
7427	ext4_exit_sysfs();
7428	ext4_exit_system_zone();
7429	ext4_exit_pageio();
7430	ext4_exit_post_read_processing();
7431	ext4_exit_es();
7432	ext4_exit_pending();
7433}
7434
7435MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
7436MODULE_DESCRIPTION("Fourth Extended Filesystem");
7437MODULE_LICENSE("GPL");
7438MODULE_SOFTDEP("pre: crc32c");
7439module_init(ext4_init_fs)
7440module_exit(ext4_exit_fs)
7441