// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
#include <linux/random.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include "ext4.h"
#include "ext4_extents.h"	/* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
#include "fsmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>

static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
static struct ratelimit_state ext4_mount_msg_ratelimit;

static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
			     unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static void ext4_update_super(struct super_block *sb);
static int ext4_commit_super(struct super_block *sb);
static int ext4_mark_recovery_complete(struct super_block *sb,
					struct ext4_super_block *es);
static int ext4_clear_journal_err(struct super_block *sb,
				  struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
					    unsigned int journal_inum);

/*
 * Lock ordering
 *
 * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
 * i_mmap_rwsem (inode->i_mmap_rwsem)!
 *
 * page fault path:
 * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
 *   page lock -> i_data_sem (rw)
 *
 * buffered write path:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 *   i_data_sem (rw)
 *
 * truncate:
 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
 *   i_data_sem (rw)
 *
 * direct IO:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 *
 * writepages:
 * transaction start -> page lock(s) -> i_data_sem (rw)
 */

#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext2",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif


static struct file_system_type ext3_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext3",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)


static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
				  bh_end_io_t *end_io)
{
	/*
	 * The buffer's verified bit is no longer valid after reading from
	 * disk again due to write out error, clear it to make sure we
	 * recheck the buffer contents.
	 */
	clear_buffer_verified(bh);

	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
	get_bh(bh);
	submit_bh(REQ_OP_READ, op_flags, bh);
}

void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
			 bh_end_io_t *end_io)
{
	BUG_ON(!buffer_locked(bh));

	if (ext4_buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return;
	}
	__ext4_read_bh(bh, op_flags, end_io);
}

int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
{
	BUG_ON(!buffer_locked(bh));

	if (ext4_buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	__ext4_read_bh(bh, op_flags, end_io);

	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;
}

int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
{
	lock_buffer(bh);
	if (!wait) {
		ext4_read_bh_nowait(bh, op_flags, NULL);
		return 0;
	}
	return ext4_read_bh(bh, op_flags, NULL);
}

/*
 * This works like __bread_gfp() except it uses ERR_PTR for error
 * returns.  Currently with sb_bread it's impossible to distinguish
 * between ENOMEM and EIO situations (since both result in a NULL
 * return).
 */
static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
					       sector_t block, int op_flags,
					       gfp_t gfp)
{
	struct buffer_head *bh;
	int ret;

	bh = sb_getblk_gfp(sb, block, gfp);
	if (bh == NULL)
		return ERR_PTR(-ENOMEM);
	if (ext4_buffer_uptodate(bh))
		return bh;

	ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
	if (ret) {
		put_bh(bh);
		return ERR_PTR(ret);
	}
	return bh;
}

struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
				  int op_flags)
{
	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
}

struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
					    sector_t block)
{
	return __ext4_sb_bread_gfp(sb, block, 0, 0);
}

void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);

	if (likely(bh)) {
		if (trylock_buffer(bh))
			ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
		brelse(bh);
	}
}
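
/*
 * Hypothetical usage sketch (not from this file): callers of the
 * ext4_sb_bread() family above use the ERR_PTR convention, which keeps
 * -ENOMEM and -EIO distinguishable (unlike sb_bread()):
 *
 *	struct buffer_head *bh = ext4_sb_bread(sb, block, 0);
 *
 *	if (IS_ERR(bh))
 *		return PTR_ERR(bh);
 *	... use bh->b_data ...
 *	brelse(bh);
 */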

static int ext4_verify_csum_type(struct super_block *sb,
				 struct ext4_super_block *es)
{
	if (!ext4_has_feature_metadata_csum(sb))
		return 1;

	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}

static __le32 ext4_superblock_csum(struct super_block *sb,
				   struct ext4_super_block *es)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int offset = offsetof(struct ext4_super_block, s_checksum);
	__u32 csum;

	csum = ext4_chksum(sbi, ~0, (char *)es, offset);

	return cpu_to_le32(csum);
}

static int ext4_superblock_csum_verify(struct super_block *sb,
				       struct ext4_super_block *es)
{
	if (!ext4_has_metadata_csum(sb))
		return 1;

	return es->s_checksum == ext4_superblock_csum(sb, es);
}

void ext4_superblock_csum_set(struct super_block *sb)
{
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;

	if (!ext4_has_metadata_csum(sb))
		return;

	es->s_checksum = ext4_superblock_csum(sb, es);
}

ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
			       struct ext4_group_desc *bg)
{
	return le32_to_cpu(bg->bg_block_bitmap_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
			       struct ext4_group_desc *bg)
{
	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_table(struct super_block *sb,
			      struct ext4_group_desc *bg)
{
	return le32_to_cpu(bg->bg_inode_table_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

__u32 ext4_free_group_clusters(struct super_block *sb,
			       struct ext4_group_desc *bg)
{
	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
}

__u32 ext4_free_inodes_count(struct super_block *sb,
			     struct ext4_group_desc *bg)
{
	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
}

__u32 ext4_used_dirs_count(struct super_block *sb,
			   struct ext4_group_desc *bg)
{
	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
}

__u32 ext4_itable_unused_count(struct super_block *sb,
			       struct ext4_group_desc *bg)
{
	return le16_to_cpu(bg->bg_itable_unused_lo) |
		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}

void ext4_block_bitmap_set(struct super_block *sb,
			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
}

void ext4_inode_bitmap_set(struct super_block *sb,
			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
}

void ext4_inode_table_set(struct super_block *sb,
			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

void ext4_free_group_clusters_set(struct super_block *sb,
				  struct ext4_group_desc *bg, __u32 count)
{
	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}

void ext4_free_inodes_set(struct super_block *sb,
			  struct ext4_group_desc *bg, __u32 count)
{
	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
}

void ext4_used_dirs_set(struct super_block *sb,
			struct ext4_group_desc *bg, __u32 count)
{
	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
}

void ext4_itable_unused_set(struct super_block *sb,
			    struct ext4_group_desc *bg, __u32 count)
{
	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
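
/*
 * Worked example (illustrative, not from the original source): with a
 * 64-bit group descriptor, a block bitmap at block 0x1_0000_0010 is
 * stored split as bg_block_bitmap_lo = 0x00000010 and
 * bg_block_bitmap_hi = 0x00000001; the accessors above recombine them
 * as lo | ((ext4_fsblk_t)hi << 32) == 0x100000010, and the setters do
 * the inverse split.  The 16-bit counters follow the same pattern with
 * a 16-bit shift.
 */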

static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
	now = clamp_val(now, 0, (1ull << 40) - 1);

	*lo = cpu_to_le32(lower_32_bits(now));
	*hi = upper_32_bits(now);
}

static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
{
	return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
}
#define ext4_update_tstamp(es, tstamp) \
	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
			     ktime_get_real_seconds())
#define ext4_get_tstamp(es, tstamp) \
	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
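
/*
 * Worked example (illustrative, not from the original source): the
 * superblock stores 40-bit timestamps as a 32-bit low word plus an
 * 8-bit high byte.  For now = 0x12_3456789A seconds, the helpers above
 * store *lo = cpu_to_le32(0x3456789A) and *hi = 0x12, and decode it
 * back as ((time64_t)0x12 << 32) + 0x3456789A.  Values are clamped to
 * 2^40 - 1, pushing the overflow far past the 32-bit limit of 2038.
 */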

/*
 * The del_gendisk() function uninitializes the disk-specific data
 * structures, including the bdi structure, without telling anyone
 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 * This is a kludge to prevent these oops until we can put in a proper
 * hook in del_gendisk() to inform the VFS and file system layers.
 */
static int block_device_ejected(struct super_block *sb)
{
	struct inode *bd_inode = sb->s_bdev->bd_inode;
	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);

	return bdi->dev == NULL;
}

static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int error = is_journal_aborted(journal);
	struct ext4_journal_cb_entry *jce;

	BUG_ON(txn->t_state == T_FINISHED);

	ext4_process_freed_data(sb, txn->t_tid);

	spin_lock(&sbi->s_md_lock);
	while (!list_empty(&txn->t_private_list)) {
		jce = list_entry(txn->t_private_list.next,
				 struct ext4_journal_cb_entry, jce_list);
		list_del_init(&jce->jce_list);
		spin_unlock(&sbi->s_md_lock);
		jce->jce_func(sb, jce, error);
		spin_lock(&sbi->s_md_lock);
	}
	spin_unlock(&sbi->s_md_lock);
}

/*
 * This writepage callback for write_cache_pages()
 * takes care of a few cases after page cleaning.
 *
 * write_cache_pages() already checks for dirty pages
 * and calls clear_page_dirty_for_io(), which we want,
 * to write protect the pages.
 *
 * However, we may have to redirty a page (see below.)
 */
static int ext4_journalled_writepage_callback(struct page *page,
					      struct writeback_control *wbc,
					      void *data)
{
	transaction_t *transaction = (transaction_t *) data;
	struct buffer_head *bh, *head;
	struct journal_head *jh;

	bh = head = page_buffers(page);
	do {
		/*
		 * We have to redirty a page in these cases:
		 * 1) If buffer is dirty, it means the page was dirty because it
		 * contains a buffer that needs checkpointing.  So the dirty bit
		 * needs to be preserved so that checkpointing writes the buffer
		 * properly.
		 * 2) If buffer is not part of the committing transaction
		 * (we may have just accidentally come across this buffer because
		 * inode range tracking is not exact) or if the currently running
		 * transaction already contains this buffer as well, dirty bit
		 * needs to be preserved so that the buffer gets writeprotected
		 * properly on running transaction's commit.
		 */
		jh = bh2jh(bh);
		if (buffer_dirty(bh) ||
		    (jh && (jh->b_transaction != transaction ||
			    jh->b_next_transaction))) {
			redirty_page_for_writepage(wbc, page);
			goto out;
		}
	} while ((bh = bh->b_this_page) != head);

out:
	return AOP_WRITEPAGE_ACTIVATE;
}

static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};

	return write_cache_pages(mapping, &wbc,
				 ext4_journalled_writepage_callback,
				 jinode->i_transaction);
}

static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	int ret;

	if (ext4_should_journal_data(jinode->i_vfs_inode))
		ret = ext4_journalled_submit_inode_data_buffers(jinode);
	else
		ret = jbd2_journal_submit_inode_data_buffers(jinode);

	return ret;
}

static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	int ret = 0;

	if (!ext4_should_journal_data(jinode->i_vfs_inode))
		ret = jbd2_journal_finish_inode_data_buffers(jinode);

	return ret;
}

static bool system_going_down(void)
{
	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
		|| system_state == SYSTEM_RESTART;
}

struct ext4_err_translation {
	int code;
	int errno;
};

#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }

static struct ext4_err_translation err_translation[] = {
	EXT4_ERR_TRANSLATE(EIO),
	EXT4_ERR_TRANSLATE(ENOMEM),
	EXT4_ERR_TRANSLATE(EFSBADCRC),
	EXT4_ERR_TRANSLATE(EFSCORRUPTED),
	EXT4_ERR_TRANSLATE(ENOSPC),
	EXT4_ERR_TRANSLATE(ENOKEY),
	EXT4_ERR_TRANSLATE(EROFS),
	EXT4_ERR_TRANSLATE(EFBIG),
	EXT4_ERR_TRANSLATE(EEXIST),
	EXT4_ERR_TRANSLATE(ERANGE),
	EXT4_ERR_TRANSLATE(EOVERFLOW),
	EXT4_ERR_TRANSLATE(EBUSY),
	EXT4_ERR_TRANSLATE(ENOTDIR),
	EXT4_ERR_TRANSLATE(ENOTEMPTY),
	EXT4_ERR_TRANSLATE(ESHUTDOWN),
	EXT4_ERR_TRANSLATE(EFAULT),
};

static int ext4_errno_to_code(int errno)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(err_translation); i++)
		if (err_translation[i].errno == errno)
			return err_translation[i].code;
	return EXT4_ERR_UNKNOWN;
}
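
/*
 * Illustrative note (not from the original source): each table entry
 * expands via the macro above, e.g. EXT4_ERR_TRANSLATE(EIO) becomes
 * { .code = EXT4_ERR_EIO, .errno = EIO }, so ext4_errno_to_code(EIO)
 * returns the stable EXT4_ERR_EIO value recorded in the superblock's
 * error fields, and anything unlisted maps to EXT4_ERR_UNKNOWN.
 */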

static void save_error_info(struct super_block *sb, int error,
			    __u32 ino, __u64 block,
			    const char *func, unsigned int line)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/* We default to EFSCORRUPTED error... */
	if (error == 0)
		error = EFSCORRUPTED;

	spin_lock(&sbi->s_error_lock);
	sbi->s_add_error_count++;
	sbi->s_last_error_code = error;
	sbi->s_last_error_line = line;
	sbi->s_last_error_ino = ino;
	sbi->s_last_error_block = block;
	sbi->s_last_error_func = func;
	sbi->s_last_error_time = ktime_get_real_seconds();
	if (!sbi->s_first_error_time) {
		sbi->s_first_error_code = error;
		sbi->s_first_error_line = line;
		sbi->s_first_error_ino = ino;
		sbi->s_first_error_block = block;
		sbi->s_first_error_func = func;
		sbi->s_first_error_time = sbi->s_last_error_time;
	}
	spin_unlock(&sbi->s_error_lock);
}

/* Deal with the reporting of failure conditions on a filesystem such as
 * inconsistencies detected or read IO failures.
 *
 * On ext2, we can store the error state of the filesystem in the
 * superblock.  That is not possible on ext4, because we may have other
 * write ordering constraints on the superblock which prevent us from
 * writing it out straight away; and given that the journal is about to
 * be aborted, we can't rely on the current, or future, transactions to
 * write out the superblock safely.
 *
 * We'll just use the jbd2_journal_abort() error code to record an error in
 * the journal instead.  On recovery, the journal will complain about
 * that error until we've noted it down and cleared it.
 *
 * If force_ro is set, we unconditionally force the filesystem into an
 * ABORT|READONLY state, unless the error response on the fs has been set to
 * panic in which case we take the easy way out and panic immediately. This is
 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 * at a critical moment in log management.
 */
static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
			      __u32 ino, __u64 block,
			      const char *func, unsigned int line)
{
	journal_t *journal = EXT4_SB(sb)->s_journal;
	bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);

	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
	if (test_opt(sb, WARN_ON_ERROR))
		WARN_ON_ONCE(1);

	if (!continue_fs && !sb_rdonly(sb)) {
		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
		if (journal)
			jbd2_journal_abort(journal, -EIO);
	}

	if (!bdev_read_only(sb->s_bdev)) {
		save_error_info(sb, error, ino, block, func, line);
		/*
		 * In case the fs should keep running, we need to writeout
		 * superblock through the journal. Due to lock ordering
		 * constraints, it may not be safe to do it right here so we
		 * defer superblock flushing to a workqueue.
		 */
		if (continue_fs && journal)
			schedule_work(&EXT4_SB(sb)->s_error_work);
		else
			ext4_commit_super(sb);
	}

	/*
	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
	 * could panic during 'reboot -f' as the underlying device got already
	 * disabled.
	 */
	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
		panic("EXT4-fs (device %s): panic forced after error\n",
			sb->s_id);
	}

	if (sb_rdonly(sb) || continue_fs)
		return;

	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
	/*
	 * Make sure updated value of ->s_mount_flags will be visible before
	 * ->s_flags update
	 */
	smp_wmb();
	sb->s_flags |= SB_RDONLY;
}

static void flush_stashed_error_work(struct work_struct *work)
{
	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
						s_error_work);
	journal_t *journal = sbi->s_journal;
	handle_t *handle;

	/*
	 * If the journal is still running, we have to write out superblock
	 * through the journal to avoid collisions of other journalled sb
	 * updates.
	 *
	 * We use jbd2 functions directly here to avoid recursing back into
	 * ext4 error handling code during handling of previous errors.
	 */
	if (!sb_rdonly(sbi->s_sb) && journal) {
		struct buffer_head *sbh = sbi->s_sbh;
		handle = jbd2_journal_start(journal, 1);
		if (IS_ERR(handle))
			goto write_directly;
		if (jbd2_journal_get_write_access(handle, sbh)) {
			jbd2_journal_stop(handle);
			goto write_directly;
		}
		ext4_update_super(sbi->s_sb);
		if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
			ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
				 "superblock detected");
			clear_buffer_write_io_error(sbh);
			set_buffer_uptodate(sbh);
		}

		if (jbd2_journal_dirty_metadata(handle, sbh)) {
			jbd2_journal_stop(handle);
			goto write_directly;
		}
		jbd2_journal_stop(handle);
		return;
	}
write_directly:
	/*
	 * Write through journal failed. Write sb directly to get error info
	 * out and hope for the best.
	 */
	ext4_commit_super(sbi->s_sb);
}

#define ext4_error_ratelimit(sb)					\
		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
			     "EXT4-fs error")

void __ext4_error(struct super_block *sb, const char *function,
		  unsigned int line, bool force_ro, int error, __u64 block,
		  const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	trace_ext4_error(sb, function, line);
	if (ext4_error_ratelimit(sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_CRIT
		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
		       sb->s_id, function, line, current->comm, &vaf);
		va_end(args);
	}
	ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
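
/*
 * Hypothetical call-site sketch (not from this file): __ext4_error() is
 * normally reached through wrapper macros in ext4.h such as
 * ext4_error(), which supply __func__ and __LINE__, e.g.:
 *
 *	if (unlikely(bad_group_desc))
 *		ext4_error(sb, "corrupt descriptor for group %u", group);
 */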

void __ext4_error_inode(struct inode *inode, const char *function,
			unsigned int line, ext4_fsblk_t block, int error,
			const char *fmt, ...)
{
	va_list args;
	struct va_format vaf;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return;

	trace_ext4_error(inode->i_sb, function, line);
	if (ext4_error_ratelimit(inode->i_sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		if (block)
			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
			       "inode #%lu: block %llu: comm %s: %pV\n",
			       inode->i_sb->s_id, function, line, inode->i_ino,
			       block, current->comm, &vaf);
		else
			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
			       "inode #%lu: comm %s: %pV\n",
			       inode->i_sb->s_id, function, line, inode->i_ino,
			       current->comm, &vaf);
		va_end(args);
	}
	ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
			  function, line);
}

void __ext4_error_file(struct file *file, const char *function,
		       unsigned int line, ext4_fsblk_t block,
		       const char *fmt, ...)
{
	va_list args;
	struct va_format vaf;
	struct inode *inode = file_inode(file);
	char pathname[80], *path;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return;

	trace_ext4_error(inode->i_sb, function, line);
	if (ext4_error_ratelimit(inode->i_sb)) {
		path = file_path(file, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		if (block)
			printk(KERN_CRIT
			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
			       "block %llu: comm %s: path %s: %pV\n",
			       inode->i_sb->s_id, function, line, inode->i_ino,
			       block, current->comm, path, &vaf);
		else
			printk(KERN_CRIT
			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
			       "comm %s: path %s: %pV\n",
			       inode->i_sb->s_id, function, line, inode->i_ino,
			       current->comm, path, &vaf);
		va_end(args);
	}
	ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
			  function, line);
}

const char *ext4_decode_error(struct super_block *sb, int errno,
			      char nbuf[16])
{
	char *errstr = NULL;

	switch (errno) {
	case -EFSCORRUPTED:
		errstr = "Corrupt filesystem";
		break;
	case -EFSBADCRC:
		errstr = "Filesystem failed CRC";
		break;
	case -EIO:
		errstr = "IO failure";
		break;
	case -ENOMEM:
		errstr = "Out of memory";
		break;
	case -EROFS:
		if (!sb || (EXT4_SB(sb)->s_journal &&
			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
			errstr = "Journal has aborted";
		else
			errstr = "Readonly filesystem";
		break;
	default:
		/* If the caller passed in an extra buffer for unknown
		 * errors, textualise them now.  Else we just return
		 * NULL. */
		if (nbuf) {
			/* Check for truncated error codes... */
			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
				errstr = nbuf;
		}
		break;
	}

	return errstr;
}

/* __ext4_std_error decodes expected errors from journaling functions
 * automatically and invokes the appropriate error response. */

void __ext4_std_error(struct super_block *sb, const char *function,
		      unsigned int line, int errno)
{
	char nbuf[16];
	const char *errstr;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	/* Special case: if the error is EROFS, and we're not already
	 * inside a transaction, then there's really no point in logging
	 * an error. */
	if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
		return;

	if (ext4_error_ratelimit(sb)) {
		errstr = ext4_decode_error(sb, errno, nbuf);
		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
		       sb->s_id, function, line, errstr);
	}

	ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}

void __ext4_msg(struct super_block *sb,
		const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	atomic_inc(&EXT4_SB(sb)->s_msg_count);
	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
	va_end(args);
}

static int ext4_warning_ratelimit(struct super_block *sb)
{
	atomic_inc(&EXT4_SB(sb)->s_warning_count);
	return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
			    "EXT4-fs warning");
}

void __ext4_warning(struct super_block *sb, const char *function,
		    unsigned int line, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (!ext4_warning_ratelimit(sb))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
	       sb->s_id, function, line, &vaf);
	va_end(args);
}

void __ext4_warning_inode(const struct inode *inode, const char *function,
			  unsigned int line, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (!ext4_warning_ratelimit(inode->i_sb))
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
	       function, line, inode->i_ino, current->comm, &vaf);
	va_end(args);
}

void __ext4_grp_locked_error(const char *function, unsigned int line,
			     struct super_block *sb, ext4_group_t grp,
			     unsigned long ino, ext4_fsblk_t block,
			     const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
	struct va_format vaf;
	va_list args;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	trace_ext4_error(sb, function, line);
	if (ext4_error_ratelimit(sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
		       sb->s_id, function, line, grp);
		if (ino)
			printk(KERN_CONT "inode %lu: ", ino);
		if (block)
			printk(KERN_CONT "block %llu:",
			       (unsigned long long) block);
		printk(KERN_CONT "%pV\n", &vaf);
		va_end(args);
	}

	if (test_opt(sb, ERRORS_CONT)) {
		if (test_opt(sb, WARN_ON_ERROR))
			WARN_ON_ONCE(1);
		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
		if (!bdev_read_only(sb->s_bdev)) {
			save_error_info(sb, EFSCORRUPTED, ino, block, function,
					line);
			schedule_work(&EXT4_SB(sb)->s_error_work);
		}
		return;
	}
	ext4_unlock_group(sb, grp);
	ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
	/*
	 * We only get here in the ERRORS_RO case; relocking the group
	 * may be dangerous, but nothing bad will happen since the
	 * filesystem will have already been marked read/only and the
	 * journal has been aborted.  We return 1 as a hint to callers
	 * who might want to use the return value from
	 * ext4_grp_locked_error() to distinguish between the
	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
	 * aggressively from the ext4 function in question, with a
	 * more appropriate error code.
	 */
	ext4_lock_group(sb, grp);
	return;
}

void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
				      ext4_group_t group,
				      unsigned int flags)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
	int ret;

	if (!grp || !gdp)
		return;
	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
					    &grp->bb_state);
		if (!ret)
			percpu_counter_sub(&sbi->s_freeclusters_counter,
					   grp->bb_free);
	}

	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
					    &grp->bb_state);
		if (!ret && gdp) {
			int count;

			count = ext4_free_inodes_count(sb, gdp);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
	}
}

void ext4_update_dynamic_rev(struct super_block *sb)
{
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;

	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
		return;

	ext4_warning(sb,
		     "updating to rev %d because of new feature flag, "
		     "running e2fsck is recommended",
		     EXT4_DYNAMIC_REV);

	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
	/* leave es->s_feature_*compat flags alone */
	/* es->s_uuid will be set by e2fsck if empty */

	/*
	 * The rest of the superblock fields should be zero, and if not it
	 * means they are likely already in use, so leave them alone.  We
	 * can leave it up to e2fsck to clean up any inconsistencies there.
	 */
}

/*
 * Open the external journal device
 */
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
	if (IS_ERR(bdev))
		goto fail;
	return bdev;

fail:
	ext4_msg(sb, KERN_ERR,
		 "failed to open journal device unknown-block(%u,%u) %ld",
		 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
	return NULL;
}

/*
 * Release the journal device
 */
static void ext4_blkdev_put(struct block_device *bdev)
{
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}

static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
	struct block_device *bdev;
	bdev = sbi->s_journal_bdev;
	if (bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		invalidate_bdev(bdev);
		ext4_blkdev_put(bdev);
		sbi->s_journal_bdev = NULL;
	}
}

static inline struct inode *orphan_list_entry(struct list_head *l)
{
	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
}
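
/*
 * Illustrative note (not from the original source): list_entry() is
 * container_of(), so given the embedded list_head l, the line above
 * recovers the enclosing ext4_inode_info and then takes the address of
 * its vfs_inode member - no extra pointer is stored in the orphan list.
 */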

static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
	struct list_head *l;

	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
		 le32_to_cpu(sbi->s_es->s_last_orphan));

	printk(KERN_ERR "sb_info orphan list:\n");
	list_for_each(l, &sbi->s_orphan) {
		struct inode *inode = orphan_list_entry(l);
		printk(KERN_ERR "  "
		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
		       inode->i_sb->s_id, inode->i_ino, inode,
		       inode->i_mode, inode->i_nlink,
		       NEXT_ORPHAN(inode));
	}
}

#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);

static inline void ext4_quota_off_umount(struct super_block *sb)
{
	int type;

	/* Use our quota_off function to clear inode flags etc. */
	for (type = 0; type < EXT4_MAXQUOTAS; type++)
		ext4_quota_off(sb, type);
}

/*
 * This is a helper function which is used in the mount/remount
 * codepaths (which holds s_umount) to fetch the quota file name.
 */
static inline char *get_qf_name(struct super_block *sb,
				struct ext4_sb_info *sbi,
				int type)
{
	return rcu_dereference_protected(sbi->s_qf_names[type],
					 lockdep_is_held(&sb->s_umount));
}
#else
static inline void ext4_quota_off_umount(struct super_block *sb)
{
}
#endif

static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	struct buffer_head **group_desc;
	struct flex_groups **flex_groups;
	int aborted = 0;
	int i, err;

	/*
	 * Unregister sysfs before destroying the jbd2 journal: the
	 * attr_journal_task attribute can still be read via sysfs, and
	 * by then sbi->s_journal->j_task may already be NULL.
	 *
	 * Likewise, unregister sysfs before flushing sbi->s_error_work:
	 * a user reading /proc/fs/ext4/xx/mb_groups during umount may
	 * fail a metadata read's verification and queue error work, and
	 * flush_stashed_error_work() calling start_this_handle() could
	 * then trigger a BUG_ON.
	 */
	ext4_unregister_sysfs(sb);

	ext4_unregister_li_request(sb);
	ext4_quota_off_umount(sb);
	flush_work(&sbi->s_error_work);
	destroy_workqueue(sbi->rsv_conversion_wq);

	if (sbi->s_journal) {
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	del_timer_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(group_desc[i]);
	kvfree(group_desc);
	flex_groups = rcu_dereference(sbi->s_flex_groups);
	if (flex_groups) {
		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
			kvfree(flex_groups[i]);
		kvfree(flex_groups);
	}
	rcu_read_unlock();
	percpu_counter_destroy(&sbi->s_freeclusters_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
	percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	J_ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
		sync_blockdev(sbi->s_journal_bdev);
		ext4_blkdev_remove(sbi);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	kobject_put(&sbi->s_kobj);
	wait_for_completion(&sbi->s_kobj_unregister);
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}

static struct kmem_cache *ext4_inode_cachep;

/*
 * Called inside transaction, so use GFP_NOFS
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
	struct ext4_inode_info *ei;

	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	inode_set_iversion(&ei->vfs_inode, 1);
	ei->i_flags = 0;
	spin_lock_init(&ei->i_raw_lock);
	INIT_LIST_HEAD(&ei->i_prealloc_list);
	atomic_set(&ei->i_prealloc_active, 0);
	spin_lock_init(&ei->i_prealloc_lock);
	ext4_es_init_tree(&ei->i_es_tree);
	rwlock_init(&ei->i_es_lock);
	INIT_LIST_HEAD(&ei->i_es_list);
	ei->i_es_all_nr = 0;
	ei->i_es_shk_nr = 0;
	ei->i_es_shrink_lblk = 0;
	ei->i_reserved_data_blocks = 0;
	spin_lock_init(&(ei->i_block_reservation_lock));
	ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
	ei->i_reserved_quota = 0;
	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
	ei->jinode = NULL;
	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
	spin_lock_init(&ei->i_completed_io_lock);
	ei->i_sync_tid = 0;
	ei->i_datasync_tid = 0;
	atomic_set(&ei->i_unwritten, 0);
	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
	ext4_fc_init_inode(&ei->vfs_inode);
	mutex_init(&ei->i_fc_lock);
	return &ei->vfs_inode;
}

static int ext4_drop_inode(struct inode *inode)
{
	int drop = generic_drop_inode(inode);

	if (!drop)
		drop = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, drop);
	return drop;
}

static void ext4_free_in_core_inode(struct inode *inode)
{
	fscrypt_free_inode(inode);
	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
		pr_warn("%s: inode %ld still in fc list",
			__func__, inode->i_ino);
	}
	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}

static void ext4_destroy_inode(struct inode *inode)
{
	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
		ext4_msg(inode->i_sb, KERN_ERR,
			 "Inode %lu (%p): orphan list check failed!",
			 inode->i_ino, EXT4_I(inode));
		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
				EXT4_I(inode), sizeof(struct ext4_inode_info),
				true);
		dump_stack();
	}

	if (EXT4_I(inode)->i_reserved_data_blocks)
		ext4_msg(inode->i_sb, KERN_ERR,
			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
			 inode->i_ino, EXT4_I(inode),
			 EXT4_I(inode)->i_reserved_data_blocks);
}

static void init_once(void *foo)
{
	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;

	INIT_LIST_HEAD(&ei->i_orphan);
	init_rwsem(&ei->xattr_sem);
	init_rwsem(&ei->i_data_sem);
	init_rwsem(&ei->i_mmap_sem);
	inode_init_once(&ei->vfs_inode);
	ext4_fc_init_inode(&ei->vfs_inode);
}
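
/*
 * Illustrative note (not from the original source): init_once() is the
 * slab constructor passed to kmem_cache_create_usercopy() below.  It
 * runs when a slab object is first set up, not on every allocation,
 * which is why per-inode state is reinitialized in ext4_alloc_inode()
 * while long-lived invariants (list heads, rwsems) live here.
 */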

static int __init init_inodecache(void)
{
	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
				sizeof(struct ext4_inode_info), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
					SLAB_ACCOUNT),
				offsetof(struct ext4_inode_info, i_data),
				sizeof_field(struct ext4_inode_info, i_data),
				init_once);
	if (ext4_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

static void destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(ext4_inode_cachep);
}

void ext4_clear_inode(struct inode *inode)
{
	ext4_fc_del(inode);
	invalidate_inode_buffers(inode);
	clear_inode(inode);
	ext4_discard_preallocations(inode, 0);
	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
	dquot_drop(inode);
	if (EXT4_I(inode)->jinode) {
		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
					       EXT4_I(inode)->jinode);
		jbd2_free_inode(EXT4_I(inode)->jinode);
		EXT4_I(inode)->jinode = NULL;
	}
	fscrypt_put_encryption_info(inode);
	fsverity_cleanup_inode(inode);
}

static struct inode *ext4_nfs_get_inode(struct super_block *sb,
					u64 ino, u32 generation)
{
	struct inode *inode;

	/*
	 * Currently we don't know the generation for parent directory, so
	 * a generation of 0 means "accept any"
	 */
	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
	if (IS_ERR(inode))
		return ERR_CAST(inode);
	if (generation && inode->i_generation != generation) {
		iput(inode);
		return ERR_PTR(-ESTALE);
	}

	return inode;
}

static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}

static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}

static int ext4_nfs_commit_metadata(struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL
	};

	trace_ext4_nfs_commit_metadata(inode);
	return ext4_write_inode(inode, &wbc);
}
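
/*
 * Illustrative note (not from the original source): for a common
 * FILEID_INO32_GEN handle, generic_fh_to_dentry() unpacks the
 * (inode number, generation) pair from the fid and hands it to
 * ext4_nfs_get_inode() above; a stale handle whose generation no
 * longer matches the on-disk inode yields -ESTALE to the NFS client.
 */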

#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
}

static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
							void *fs_data)
{
	handle_t *handle = fs_data;
	int res, res2, credits, retries = 0;

	/*
	 * Encrypting the root directory is not allowed because e2fsck expects
	 * lost+found to exist and be unencrypted, and encrypting the root
	 * directory would imply encrypting the lost+found directory as well as
	 * the filename "lost+found" itself.
	 */
	if (inode->i_ino == EXT4_ROOT_INO)
		return -EPERM;

	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
		return -EINVAL;

	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
		return -EOPNOTSUPP;

	res = ext4_convert_inline_data(inode);
	if (res)
		return res;

	/*
	 * If a journal handle was specified, then the encryption context is
	 * being set on a new inode via inheritance and is part of a larger
	 * transaction to create the inode.  Otherwise the encryption context is
	 * being set on an existing inode in its own transaction.  Only in the
	 * latter case should the "retry on ENOSPC" logic be used.
	 */

	if (handle) {
		res = ext4_xattr_set_handle(handle, inode,
					    EXT4_XATTR_INDEX_ENCRYPTION,
					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
					    ctx, len, 0);
		if (!res) {
			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
			ext4_clear_inode_state(inode,
					EXT4_STATE_MAY_INLINE_DATA);
			/*
			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
			 * S_DAX may be disabled
			 */
			ext4_set_inode_flags(inode, false);
		}
		return res;
	}

	res = dquot_initialize(inode);
	if (res)
		return res;
retry:
	res = ext4_xattr_set_credits(inode, len, false /* is_create */,
				     &credits);
	if (res)
		return res;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
				    ctx, len, 0);
	if (!res) {
		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
		/*
		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
		 * S_DAX may be disabled
		 */
		ext4_set_inode_flags(inode, false);
		res = ext4_mark_inode_dirty(handle, inode);
		if (res)
			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
	}
	res2 = ext4_journal_stop(handle);

	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	if (!res)
		res = res2;
	return res;
}
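
/*
 * Illustrative note (not from the original source): the retry: loop in
 * ext4_set_context() above is the standard ext4 pattern for transient
 * ENOSPC.  The handle is stopped first (letting the running
 * transaction commit and free blocks), and ext4_should_retry_alloc()
 * decides - bounded by a retry counter - whether restarting the whole
 * journal_start/xattr_set sequence is worthwhile.
 */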

static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
	return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}

static bool ext4_has_stable_inodes(struct super_block *sb)
{
	return ext4_has_feature_stable_inodes(sb);
}

static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
				       int *ino_bits_ret, int *lblk_bits_ret)
{
	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
}

static const struct fscrypt_operations ext4_cryptops = {
	.key_prefix		= "ext4:",
	.get_context		= ext4_get_context,
	.set_context		= ext4_set_context,
	.get_dummy_policy	= ext4_get_dummy_policy,
	.empty_dir		= ext4_empty_dir,
	.max_namelen		= EXT4_NAME_LEN,
	.has_stable_inodes	= ext4_has_stable_inodes,
	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
};
#endif

#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 const struct path *path);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);

static struct dquot **ext4_get_dquots(struct inode *inode)
{
	return EXT4_I(inode)->i_dquot;
}

static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space	= ext4_get_reserved_space,
	.write_dquot		= ext4_write_dquot,
	.acquire_dquot		= ext4_acquire_dquot,
	.release_dquot		= ext4_release_dquot,
	.mark_dirty		= ext4_mark_dquot_dirty,
	.write_info		= ext4_write_info,
	.alloc_dquot		= dquot_alloc,
	.destroy_dquot		= dquot_destroy,
	.get_projid		= ext4_get_projid,
	.get_inode_usage	= ext4_get_inode_usage,
	.get_next_id		= dquot_get_next_id,
};

static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_state	= dquot_get_state,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
	.get_nextdqblk	= dquot_get_next_dqblk,
};
#endif

static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.free_inode	= ext4_free_in_core_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
	.get_dquots	= ext4_get_dquots,
#endif
};

static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
	.commit_metadata = ext4_nfs_commit_metadata,
};

enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit,
	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_prefetch_block_bitmaps,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};

static const match_table_t tokens = {
	{Opt_bsd_df, "bsddf"},
	{Opt_minix_df, "minixdf"},
	{Opt_grpid, "grpid"},
	{Opt_grpid, "bsdgroups"},
	{Opt_nogrpid, "nogrpid"},
	{Opt_nogrpid, "sysvgroups"},
	{Opt_resgid, "resgid=%u"},
	{Opt_resuid, "resuid=%u"},
	{Opt_sb, "sb=%u"},
	{Opt_err_cont, "errors=continue"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_nouid32, "nouid32"},
	{Opt_debug, "debug"},
	{Opt_removed, "oldalloc"},
	{Opt_removed, "orlov"},
	{Opt_user_xattr, "user_xattr"},
	{Opt_nouser_xattr, "nouser_xattr"},
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_noload, "norecovery"},
	{Opt_noload, "noload"},
	{Opt_removed, "nobh"},
	{Opt_removed, "bh"},
	{Opt_commit, "commit=%u"},
	{Opt_min_batch_time, "min_batch_time=%u"},
	{Opt_max_batch_time, "max_batch_time=%u"},
	{Opt_journal_dev, "journal_dev=%u"},
	{Opt_journal_path, "journal_path=%s"},
	{Opt_journal_checksum, "journal_checksum"},
	{Opt_nojournal_checksum, "nojournal_checksum"},
	{Opt_journal_async_commit, "journal_async_commit"},
	{Opt_abort, "abort"},
	{Opt_data_journal, "data=journal"},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_data_err_abort, "data_err=abort"},
	{Opt_data_err_ignore, "data_err=ignore"},
	{Opt_offusrjquota, "usrjquota="},
	{Opt_usrjquota, "usrjquota=%s"},
	{Opt_offgrpjquota, "grpjquota="},
	{Opt_grpjquota, "grpjquota=%s"},
	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
	{Opt_grpquota, "grpquota"},
	{Opt_noquota, "noquota"},
	{Opt_quota, "quota"},
	{Opt_usrquota, "usrquota"},
	{Opt_prjquota, "prjquota"},
	{Opt_barrier, "barrier=%u"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_i_version, "i_version"},
	{Opt_dax, "dax"},
	{Opt_dax_always, "dax=always"},
	{Opt_dax_inode, "dax=inode"},
	{Opt_dax_never, "dax=never"},
	{Opt_stripe, "stripe=%u"},
	{Opt_delalloc, "delalloc"},
	{Opt_warn_on_error, "warn_on_error"},
	{Opt_nowarn_on_error, "nowarn_on_error"},
	{Opt_lazytime, "lazytime"},
	{Opt_nolazytime, "nolazytime"},
	{Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
	{Opt_nodelalloc, "nodelalloc"},
	{Opt_removed, "mblk_io_submit"},
	{Opt_removed, "nomblk_io_submit"},
	{Opt_block_validity, "block_validity"},
	{Opt_noblock_validity, "noblock_validity"},
	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
	{Opt_journal_ioprio, "journal_ioprio=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc"},
	{Opt_noauto_da_alloc, "noauto_da_alloc"},
	{Opt_dioread_nolock, "dioread_nolock"},
	{Opt_dioread_lock, "nodioread_nolock"},
	{Opt_dioread_lock, "dioread_lock"},
	{Opt_discard, "discard"},
	{Opt_nodiscard, "nodiscard"},
	{Opt_init_itable, "init_itable=%u"},
	{Opt_init_itable, "init_itable"},
	{Opt_noinit_itable, "noinit_itable"},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, "fc_debug_force"},
	{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
	{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
	{Opt_test_dummy_encryption, "test_dummy_encryption"},
	{Opt_inlinecrypt, "inlinecrypt"},
	{Opt_nombcache, "nombcache"},
	{Opt_nombcache, "no_mbcache"},	/* for backward compatibility */
	{Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
	{Opt_removed, "noreservation"},	/* mount option from ext2/3 */
	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
	{Opt_err, NULL},
};
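
/*
 * Hypothetical parsing sketch (not from this file): the option parser
 * walks comma-separated tokens and matches each against the table
 * above with match_token() from <linux/parser.h>, e.g.:
 *
 *	substring_t args[MAX_OPT_ARGS];
 *	int token = match_token(p, tokens, args);
 *
 * A "%u" pattern such as "resuid=%u" captures its argument in args[0],
 * later extracted with match_int(); unmatched strings fall through to
 * the {Opt_err, NULL} sentinel.
 */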

static ext4_fsblk_t get_sb_block(void **data)
{
	ext4_fsblk_t	sb_block;
	char		*options = (char *) *data;

	if (!options || strncmp(options, "sb=", 3) != 0)
		return 1;	/* Default location */

	options += 3;
	/* TODO: use simple_strtoll with >32bit ext4 */
	sb_block = simple_strtoul(options, &options, 0);
	if (*options && *options != ',') {
		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
		       (char *) *data);
		return 1;
	}
	if (*options == ',')
		options++;
	*data = (void *) options;

	return sb_block;
}

#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
static const char deprecated_msg[] =
	"Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";

#ifdef CONFIG_QUOTA
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
	int ret = -1;

	if (sb_any_quota_loaded(sb) && !old_qname) {
		ext4_msg(sb, KERN_ERR,
			 "Cannot change journaled "
			 "quota options when quota turned on");
		return -1;
	}
	if (ext4_has_feature_quota(sb)) {
		ext4_msg(sb, KERN_INFO, "Journaled quota options "
			 "ignored when QUOTA feature is enabled");
		return 1;
	}
	qname = match_strdup(args);
	if (!qname) {
		ext4_msg(sb, KERN_ERR,
			 "Not enough memory for storing quotafile name");
		return -1;
	}
	if (old_qname) {
		if (strcmp(old_qname, qname) == 0)
			ret = 1;
		else
			ext4_msg(sb, KERN_ERR,
				 "%s quota file already specified",
				 QTYPE2NAME(qtype));
		goto errout;
	}
	if (strchr(qname, '/')) {
		ext4_msg(sb, KERN_ERR,
			 "quotafile must be on filesystem root");
		goto errout;
	}
	rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
	set_opt(sb, QUOTA);
	return 1;
errout:
	kfree(qname);
	return ret;
}

static int clear_qf_name(struct super_block *sb, int qtype)
{

	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *old_qname = get_qf_name(sb, sbi, qtype);

	if (sb_any_quota_loaded(sb) && old_qname) {
		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
			" when quota turned on");
		return -1;
	}
	rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
	synchronize_rcu();
	kfree(old_qname);
	return 1;
}
#endif

#define MOPT_SET	0x0001
#define MOPT_CLEAR	0x0002
#define MOPT_NOSUPPORT	0x0004
#define MOPT_EXPLICIT	0x0008
#define MOPT_CLEAR_ERR	0x0010
#define MOPT_GTE0	0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0040
#else
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ	0x0080
#define MOPT_NO_EXT2	0x0100
#define MOPT_NO_EXT3	0x0200
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING	0x0400
#define MOPT_SKIP	0x0800
#define MOPT_2		0x1000
static const struct mount_opts {
	int	token;
	int	mount_opt;
	int	flags;
} ext4_mount_opts[] = {
	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
	{Opt_commit, 0, MOPT_NO_EXT2},
	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
				    EXT4_MOUNT_JOURNAL_CHECKSUM),
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2},
	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2},
	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
	{Opt_commit, 0, MOPT_GTE0},
	{Opt_max_batch_time, 0, MOPT_GTE0},
	{Opt_min_batch_time, 0, MOPT_GTE0},
	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
	{Opt_init_itable, 0, MOPT_GTE0},
	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
	{Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_stripe, 0, MOPT_GTE0},
	{Opt_resuid, 0, MOPT_GTE0},
	{Opt_resgid, 0, MOPT_GTE0},
	{Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
	{Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
	{Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
	 MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
	{Opt_acl, 0, MOPT_NOSUPPORT},
	{Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
	{Opt_debug_want_extra_isize, 0, MOPT_GTE0},
	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
	 MOPT_SET | MOPT_Q},
	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
	 MOPT_SET | MOPT_Q},
	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
	 MOPT_SET | MOPT_Q},
	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
	 MOPT_CLEAR | MOPT_Q},
	{Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
	{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
	{Opt_offusrjquota, 0, MOPT_Q},
	{Opt_offgrpjquota, 0, MOPT_Q},
	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
	{Opt_test_dummy_encryption, 0, MOPT_STRING},
	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
	{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
	 MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
	{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
	{Opt_err, 0, 0}
};

#ifdef CONFIG_UNICODE
static const struct ext4_sb_encodings {
	__u16 magic;
	char *name;
	char *version;
} ext4_sb_encoding_map[] = {
	{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
};

static int ext4_sb_read_encoding(const struct ext4_super_block *es,
				 const struct ext4_sb_encodings **encoding,
				 __u16 *flags)
{
	__u16 magic = le16_to_cpu(es->s_encoding);
	int i;

	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
		if (magic == ext4_sb_encoding_map[i].magic)
			break;

	if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
		return -EINVAL;

	*encoding = &ext4_sb_encoding_map[i];
	*flags = le16_to_cpu(es->s_encoding_flags);

	return 0;
}
#endif
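/*
 * Illustrative sketch (added commentary, not in the original file): how a
 * mount path might consume ext4_sb_read_encoding() when the casefold
 * feature is enabled on the superblock.  Error handling is elided and the
 * printk is only for illustration.
 */
#if 0
	const struct ext4_sb_encodings *encoding_info;
	__u16 encoding_flags;

	if (ext4_sb_read_encoding(es, &encoding_info, &encoding_flags) == 0)
		/* for the one entry above: encoding_info->name == "utf8",
		 * encoding_info->version == "12.1.0" */
		printk(KERN_INFO "using encoding %s-%s\n",
		       encoding_info->name, encoding_info->version);
#endif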
static int ext4_set_test_dummy_encryption(struct super_block *sb,
					  const char *opt,
					  const substring_t *arg,
					  bool is_remount)
{
#ifdef CONFIG_FS_ENCRYPTION
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err;

	if (!ext4_has_feature_encrypt(sb)) {
		ext4_msg(sb, KERN_WARNING,
			 "test_dummy_encryption requires encrypt feature");
		return -1;
	}

	/*
	 * This mount option is just for testing, and it's not worthwhile to
	 * implement the extra complexity (e.g. RCU protection) that would be
	 * needed to allow it to be set or changed during remount.  We do
	 * allow it to be specified during remount, but only if there is no
	 * change.
	 */
	if (is_remount && !sbi->s_dummy_enc_policy.policy) {
		ext4_msg(sb, KERN_WARNING,
			 "Can't set test_dummy_encryption on remount");
		return -1;
	}
	err = fscrypt_set_test_dummy_encryption(sb, arg->from,
						&sbi->s_dummy_enc_policy);
	if (err) {
		if (err == -EEXIST)
			ext4_msg(sb, KERN_WARNING,
				 "Can't change test_dummy_encryption on remount");
		else if (err == -EINVAL)
			ext4_msg(sb, KERN_WARNING,
				 "Value of option \"%s\" is unrecognized", opt);
		else
			ext4_msg(sb, KERN_WARNING,
				 "Error processing option \"%s\" [%d]",
				 opt, err);
		return -1;
	}
	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
	return 1;
#else
	ext4_msg(sb, KERN_WARNING,
		 "test_dummy_encryption option not supported");
	return -1;
#endif
}

static int handle_mount_opt(struct super_block *sb, char *opt, int token,
			    substring_t *args, unsigned long *journal_devnum,
			    unsigned int *journal_ioprio, int is_remount)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	const struct mount_opts *m;
	kuid_t uid;
	kgid_t gid;
	int arg = 0;

#ifdef CONFIG_QUOTA
	if (token == Opt_usrjquota)
		return set_qf_name(sb, USRQUOTA, &args[0]);
	else if (token == Opt_grpjquota)
		return set_qf_name(sb, GRPQUOTA, &args[0]);
	else if (token == Opt_offusrjquota)
		return clear_qf_name(sb, USRQUOTA);
	else if (token == Opt_offgrpjquota)
		return clear_qf_name(sb, GRPQUOTA);
#endif
	switch (token) {
	case Opt_noacl:
	case Opt_nouser_xattr:
		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
		break;
	case Opt_sb:
		return 1;	/* handled by get_sb_block() */
	case Opt_removed:
		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
		return 1;
	case Opt_abort:
		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
		return 1;
	case Opt_i_version:
		sb->s_flags |= SB_I_VERSION;
		return 1;
	case Opt_lazytime:
		sb->s_flags |= SB_LAZYTIME;
		return 1;
	case Opt_nolazytime:
		sb->s_flags &= ~SB_LAZYTIME;
		return 1;
	case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
		sb->s_flags |= SB_INLINECRYPT;
#else
		ext4_msg(sb, KERN_ERR, "inline encryption not supported");
#endif
		return 1;
	}

	for (m = ext4_mount_opts; m->token != Opt_err; m++)
		if (token == m->token)
			break;

	if (m->token == Opt_err) {
		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
			 "or missing value", opt);
		return -1;
	}

	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext2", opt);
		return -1;
	}
	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext3", opt);
		return -1;
	}

	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
		return -1;
	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
		return -1;
	if (m->flags & MOPT_EXPLICIT) {
		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
			set_opt2(sb, EXPLICIT_DELALLOC);
		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
		} else
			return -1;
	}
	if (m->flags & MOPT_CLEAR_ERR)
		clear_opt(sb, ERRORS_MASK);
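	/*
	 * Illustrative note (added commentary, not in the original file):
	 * when this driver also serves ext2/ext3, the MOPT_NO_EXT2 and
	 * MOPT_NO_EXT3 checks above are what reject ext4-only options on
	 * those mounts.  For example, assuming ext4 is registered for ext2,
	 * "mount -t ext2 /dev/sdb1 /mnt -o dioread_nolock" fails here:
	 * Opt_dioread_nolock carries MOPT_EXT4_ONLY and IS_EXT2_SB() is
	 * true for a superblock whose bdev is held by ext2_fs_type.
	 */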
ext4_msg(sb, KERN_ERR, "Cannot change quota " 2185 "options when quota turned on"); 2186 return -1; 2187 } 2188 2189 if (m->flags & MOPT_NOSUPPORT) { 2190 ext4_msg(sb, KERN_ERR, "%s option not supported", opt); 2191 } else if (token == Opt_commit) { 2192 if (arg == 0) 2193 arg = JBD2_DEFAULT_MAX_COMMIT_AGE; 2194 else if (arg > INT_MAX / HZ) { 2195 ext4_msg(sb, KERN_ERR, 2196 "Invalid commit interval %d, " 2197 "must be smaller than %d", 2198 arg, INT_MAX / HZ); 2199 return -1; 2200 } 2201 sbi->s_commit_interval = HZ * arg; 2202 } else if (token == Opt_debug_want_extra_isize) { 2203 if ((arg & 1) || 2204 (arg < 4) || 2205 (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { 2206 ext4_msg(sb, KERN_ERR, 2207 "Invalid want_extra_isize %d", arg); 2208 return -1; 2209 } 2210 sbi->s_want_extra_isize = arg; 2211 } else if (token == Opt_max_batch_time) { 2212 sbi->s_max_batch_time = arg; 2213 } else if (token == Opt_min_batch_time) { 2214 sbi->s_min_batch_time = arg; 2215 } else if (token == Opt_inode_readahead_blks) { 2216 if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { 2217 ext4_msg(sb, KERN_ERR, 2218 "EXT4-fs: inode_readahead_blks must be " 2219 "0 or a power of 2 smaller than 2^31"); 2220 return -1; 2221 } 2222 sbi->s_inode_readahead_blks = arg; 2223 } else if (token == Opt_init_itable) { 2224 set_opt(sb, INIT_INODE_TABLE); 2225 if (!args->from) 2226 arg = EXT4_DEF_LI_WAIT_MULT; 2227 sbi->s_li_wait_mult = arg; 2228 } else if (token == Opt_max_dir_size_kb) { 2229 sbi->s_max_dir_size_kb = arg; 2230#ifdef CONFIG_EXT4_DEBUG 2231 } else if (token == Opt_fc_debug_max_replay) { 2232 sbi->s_fc_debug_max_replay = arg; 2233#endif 2234 } else if (token == Opt_stripe) { 2235 sbi->s_stripe = arg; 2236 } else if (token == Opt_resuid) { 2237 uid = make_kuid(current_user_ns(), arg); 2238 if (!uid_valid(uid)) { 2239 ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); 2240 return -1; 2241 } 2242 sbi->s_resuid = uid; 2243 } else if (token == Opt_resgid) { 2244 gid = make_kgid(current_user_ns(), arg); 2245 if (!gid_valid(gid)) { 2246 ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); 2247 return -1; 2248 } 2249 sbi->s_resgid = gid; 2250 } else if (token == Opt_journal_dev) { 2251 if (is_remount) { 2252 ext4_msg(sb, KERN_ERR, 2253 "Cannot specify journal on remount"); 2254 return -1; 2255 } 2256 *journal_devnum = arg; 2257 } else if (token == Opt_journal_path) { 2258 char *journal_path; 2259 struct inode *journal_inode; 2260 struct path path; 2261 int error; 2262 2263 if (is_remount) { 2264 ext4_msg(sb, KERN_ERR, 2265 "Cannot specify journal on remount"); 2266 return -1; 2267 } 2268 journal_path = match_strdup(&args[0]); 2269 if (!journal_path) { 2270 ext4_msg(sb, KERN_ERR, "error: could not dup " 2271 "journal device string"); 2272 return -1; 2273 } 2274 2275 error = kern_path(journal_path, LOOKUP_FOLLOW, &path); 2276 if (error) { 2277 ext4_msg(sb, KERN_ERR, "error: could not find " 2278 "journal device path: error %d", error); 2279 kfree(journal_path); 2280 return -1; 2281 } 2282 2283 journal_inode = d_inode(path.dentry); 2284 if (!S_ISBLK(journal_inode->i_mode)) { 2285 ext4_msg(sb, KERN_ERR, "error: journal path %s " 2286 "is not a block device", journal_path); 2287 path_put(&path); 2288 kfree(journal_path); 2289 return -1; 2290 } 2291 2292 *journal_devnum = new_encode_dev(journal_inode->i_rdev); 2293 path_put(&path); 2294 kfree(journal_path); 2295 } else if (token == Opt_journal_ioprio) { 2296 if (arg > 7) { 2297 ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" 2298 " (must be 0-7)"); 2299 
	} else if (token == Opt_journal_ioprio) {
		if (arg > 7) {
			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
				 " (must be 0-7)");
			return -1;
		}
		*journal_ioprio =
			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
	} else if (token == Opt_test_dummy_encryption) {
		return ext4_set_test_dummy_encryption(sb, opt, &args[0],
						      is_remount);
	} else if (m->flags & MOPT_DATAJ) {
		if (is_remount) {
			if (!sbi->s_journal)
				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
				ext4_msg(sb, KERN_ERR,
					 "Cannot change data mode on remount");
				return -1;
			}
		} else {
			clear_opt(sb, DATA_FLAGS);
			sbi->s_mount_opt |= m->mount_opt;
		}
#ifdef CONFIG_QUOTA
	} else if (m->flags & MOPT_QFMT) {
		if (sb_any_quota_loaded(sb) &&
		    sbi->s_jquota_fmt != m->mount_opt) {
			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
				 "quota options when quota turned on");
			return -1;
		}
		if (ext4_has_feature_quota(sb)) {
			ext4_msg(sb, KERN_INFO,
				 "Quota format mount options ignored "
				 "when QUOTA feature is enabled");
			return 1;
		}
		sbi->s_jquota_fmt = m->mount_opt;
#endif
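	/*
	 * Illustrative note (added commentary, not in the original file):
	 * the three MOPT_DATAJ entries all store into the same multi-bit
	 * EXT4_MOUNT_DATA_FLAGS field, which is why the non-remount path
	 * clears DATA_FLAGS before OR-ing in the new mode, and why the mode
	 * cannot be switched on remount.  A sketch of the resulting state:
	 *
	 *	mount -o data=journal	=> test_opt(sb, DATA_FLAGS)
	 *				   == EXT4_MOUNT_JOURNAL_DATA
	 *	mount -o data=ordered	=> ... == EXT4_MOUNT_ORDERED_DATA
	 *	mount -o data=writeback	=> ... == EXT4_MOUNT_WRITEBACK_DATA
	 */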
	} else if (token == Opt_dax || token == Opt_dax_always ||
		   token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
		switch (token) {
		case Opt_dax:
		case Opt_dax_always:
			if (is_remount &&
			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
fail_dax_change_remount:
				ext4_msg(sb, KERN_ERR, "can't change "
					 "dax mount option while remounting");
				return -1;
			}
			if (is_remount &&
			    (test_opt(sb, DATA_FLAGS) ==
			     EXT4_MOUNT_JOURNAL_DATA)) {
				ext4_msg(sb, KERN_ERR, "can't mount with "
					 "both data=journal and dax");
				return -1;
			}
			ext4_msg(sb, KERN_WARNING,
				 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			break;
		case Opt_dax_never:
			if (is_remount &&
			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			break;
		case Opt_dax_inode:
			if (is_remount &&
			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			/* Strictly for printing options */
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
			break;
		}
#else
		ext4_msg(sb, KERN_INFO, "dax option not supported");
		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
		return -1;
#endif
	} else if (token == Opt_data_err_abort) {
		sbi->s_mount_opt |= m->mount_opt;
	} else if (token == Opt_data_err_ignore) {
		sbi->s_mount_opt &= ~m->mount_opt;
	} else {
		if (!args->from)
			arg = 1;
		if (m->flags & MOPT_CLEAR)
			arg = !arg;
		else if (unlikely(!(m->flags & MOPT_SET))) {
			ext4_msg(sb, KERN_WARNING,
				 "buggy handling of option %s", opt);
			WARN_ON(1);
			return -1;
		}
		if (m->flags & MOPT_2) {
			if (arg != 0)
				sbi->s_mount_opt2 |= m->mount_opt;
			else
				sbi->s_mount_opt2 &= ~m->mount_opt;
		} else {
			if (arg != 0)
				sbi->s_mount_opt |= m->mount_opt;
			else
				sbi->s_mount_opt &= ~m->mount_opt;
		}
	}
	return 1;
}
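/*
 * Illustrative example (added commentary, not part of the original file):
 * how an option string is torn apart and fed through handle_mount_opt()
 * by parse_options() below.  The surrounding names mirror the real call
 * site in ext4_fill_super(); the literal option string is made up, and the
 * mount data buffer is writable in real life (strsep() modifies it).
 */
#if 0
	unsigned long journal_devnum = 0;
	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
	char opts[] = "data=ordered,commit=15";

	/* strsep() yields "data=ordered", then "commit=15" */
	if (!parse_options(opts, sb, &journal_devnum, &journal_ioprio, 0))
		goto failed_mount;	/* a handler returned < 0 */
	/* Afterwards: test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA
	 * and sbi->s_commit_interval == 15 * HZ. */
#endif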
" 2451 "Cannot enable project quota enforcement."); 2452 return 0; 2453 } 2454 usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); 2455 grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); 2456 if (usr_qf_name || grp_qf_name) { 2457 if (test_opt(sb, USRQUOTA) && usr_qf_name) 2458 clear_opt(sb, USRQUOTA); 2459 2460 if (test_opt(sb, GRPQUOTA) && grp_qf_name) 2461 clear_opt(sb, GRPQUOTA); 2462 2463 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 2464 ext4_msg(sb, KERN_ERR, "old and new quota " 2465 "format mixing"); 2466 return 0; 2467 } 2468 2469 if (!sbi->s_jquota_fmt) { 2470 ext4_msg(sb, KERN_ERR, "journaled quota format " 2471 "not specified"); 2472 return 0; 2473 } 2474 } 2475#endif 2476 if (test_opt(sb, DIOREAD_NOLOCK)) { 2477 int blocksize = 2478 BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 2479 if (blocksize < PAGE_SIZE) 2480 ext4_msg(sb, KERN_WARNING, "Warning: mounting with an " 2481 "experimental mount option 'dioread_nolock' " 2482 "for blocksize < PAGE_SIZE"); 2483 } 2484 return 1; 2485} 2486 2487static inline void ext4_show_quota_options(struct seq_file *seq, 2488 struct super_block *sb) 2489{ 2490#if defined(CONFIG_QUOTA) 2491 struct ext4_sb_info *sbi = EXT4_SB(sb); 2492 char *usr_qf_name, *grp_qf_name; 2493 2494 if (sbi->s_jquota_fmt) { 2495 char *fmtname = ""; 2496 2497 switch (sbi->s_jquota_fmt) { 2498 case QFMT_VFS_OLD: 2499 fmtname = "vfsold"; 2500 break; 2501 case QFMT_VFS_V0: 2502 fmtname = "vfsv0"; 2503 break; 2504 case QFMT_VFS_V1: 2505 fmtname = "vfsv1"; 2506 break; 2507 } 2508 seq_printf(seq, ",jqfmt=%s", fmtname); 2509 } 2510 2511 rcu_read_lock(); 2512 usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); 2513 grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); 2514 if (usr_qf_name) 2515 seq_show_option(seq, "usrjquota", usr_qf_name); 2516 if (grp_qf_name) 2517 seq_show_option(seq, "grpjquota", grp_qf_name); 2518 rcu_read_unlock(); 2519#endif 2520} 2521 2522static const char *token2str(int token) 2523{ 2524 const struct match_token *t; 2525 2526 for (t = tokens; t->token != Opt_err; t++) 2527 if (t->token == token && !strchr(t->pattern, '=')) 2528 break; 2529 return t->pattern; 2530} 2531 2532/* 2533 * Show an option if 2534 * - it's set to a non-default value OR 2535 * - if the per-sb default is different from the global default 2536 */ 2537static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, 2538 int nodefs) 2539{ 2540 struct ext4_sb_info *sbi = EXT4_SB(sb); 2541 struct ext4_super_block *es = sbi->s_es; 2542 int def_errors, def_mount_opt = sbi->s_def_mount_opt; 2543 const struct mount_opts *m; 2544 char sep = nodefs ? 
/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
			      int nodefs)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int def_errors, def_mount_opt = sbi->s_def_mount_opt;
	const struct mount_opts *m;
	char sep = nodefs ? '\n' : ',';

#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

	if (sbi->s_sb_block != 1)
		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
		int want_set = m->flags & MOPT_SET;
		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
		    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
			continue;
		if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
			continue; /* skip if same as the default */
		if ((want_set &&
		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
			continue; /* select Opt_noFoo vs Opt_Foo */
		SEQ_OPTS_PRINT("%s", token2str(m->token));
	}

	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
		SEQ_OPTS_PRINT("resuid=%u",
			       from_kuid_munged(&init_user_ns, sbi->s_resuid));
	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
		SEQ_OPTS_PRINT("resgid=%u",
			       from_kgid_munged(&init_user_ns, sbi->s_resgid));
	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
		SEQ_OPTS_PUTS("errors=remount-ro");
	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
		SEQ_OPTS_PUTS("errors=continue");
	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
		SEQ_OPTS_PUTS("errors=panic");
	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
	if (sb->s_flags & SB_I_VERSION)
		SEQ_OPTS_PUTS("i_version");
	if (nodefs || sbi->s_stripe)
		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
			(sbi->s_mount_opt ^ def_mount_opt)) {
		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
			SEQ_OPTS_PUTS("data=journal");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
			SEQ_OPTS_PUTS("data=ordered");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			SEQ_OPTS_PUTS("data=writeback");
	}
	if (nodefs ||
	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
			       sbi->s_inode_readahead_blks);

	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
	if (nodefs || sbi->s_max_dir_size_kb)
		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
	if (test_opt(sb, DATA_ERR_ABORT))
		SEQ_OPTS_PUTS("data_err=abort");

	fscrypt_show_test_dummy_encryption(seq, sep, sb);

	if (sb->s_flags & SB_INLINECRYPT)
		SEQ_OPTS_PUTS("inlinecrypt");

	if (test_opt(sb, DAX_ALWAYS)) {
		if (IS_EXT2_SB(sb))
			SEQ_OPTS_PUTS("dax");
		else
			SEQ_OPTS_PUTS("dax=always");
	} else if (test_opt2(sb, DAX_NEVER)) {
		SEQ_OPTS_PUTS("dax=never");
	} else if (test_opt2(sb, DAX_INODE)) {
		SEQ_OPTS_PUTS("dax=inode");
	}
	ext4_show_quota_options(seq, sb);
	return 0;
}

static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
	return _ext4_show_options(seq, root->d_sb, 0);
}

int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
	struct super_block *sb = seq->private;
	int rc;

	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
	rc = _ext4_show_options(seq, sb, 1);
	seq_puts(seq, "\n");
	return rc;
}

static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
			    int read_only)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err = 0;

	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
		ext4_msg(sb, KERN_ERR, "revision level too high, "
			 "forcing read-only mode");
		err = -EROFS;
		goto done;
	}
	if (read_only)
		goto done;
	if (!(sbi->s_mount_state & EXT4_VALID_FS))
		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
			 "running e2fsck is recommended");
	else if (sbi->s_mount_state & EXT4_ERROR_FS)
		ext4_msg(sb, KERN_WARNING,
			 "warning: mounting fs with errors, "
			 "running e2fsck is recommended");
	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
		 le16_to_cpu(es->s_mnt_count) >=
		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
		ext4_msg(sb, KERN_WARNING,
			 "warning: maximal mount count reached, "
			 "running e2fsck is recommended");
	else if (le32_to_cpu(es->s_checkinterval) &&
		 (ext4_get_tstamp(es, s_lastcheck) +
		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
		ext4_msg(sb, KERN_WARNING,
			 "warning: checktime reached, "
			 "running e2fsck is recommended");
	if (!sbi->s_journal)
		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
	le16_add_cpu(&es->s_mnt_count, 1);
	ext4_update_tstamp(es, s_mtime);
	if (sbi->s_journal)
		ext4_set_feature_journal_needs_recovery(sb);

	err = ext4_commit_super(sb);
done:
	if (test_opt(sb, DEBUG))
		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
			sb->s_blocksize,
			sbi->s_groups_count,
			EXT4_BLOCKS_PER_GROUP(sb),
			EXT4_INODES_PER_GROUP(sb),
			sbi->s_mount_opt, sbi->s_mount_opt2);

	cleancache_init_fs(sb);
	return err;
}
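/*
 * Illustrative sketch (added commentary, not in the original file): with
 * flex_bg, 2^s_log_groups_per_flex block groups share one flex_groups
 * counter set, so the array grown by ext4_alloc_flex_bg_array() below is
 * indexed by ext4_flex_group(sbi, group), i.e.
 * group >> sbi->s_log_groups_per_flex.  For example, with a common
 * log_groups_per_flex of 4 (16 groups per flex group), block groups 0..15
 * map to flex group 0 and block group 100 maps to flex group 6.
 */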
groups", size); 2735 return -ENOMEM; 2736 } 2737 } 2738 rcu_read_lock(); 2739 old_groups = rcu_dereference(sbi->s_flex_groups); 2740 if (old_groups) 2741 memcpy(new_groups, old_groups, 2742 (sbi->s_flex_groups_allocated * 2743 sizeof(struct flex_groups *))); 2744 rcu_read_unlock(); 2745 rcu_assign_pointer(sbi->s_flex_groups, new_groups); 2746 sbi->s_flex_groups_allocated = size; 2747 if (old_groups) 2748 ext4_kvfree_array_rcu(old_groups); 2749 return 0; 2750} 2751 2752static int ext4_fill_flex_info(struct super_block *sb) 2753{ 2754 struct ext4_sb_info *sbi = EXT4_SB(sb); 2755 struct ext4_group_desc *gdp = NULL; 2756 struct flex_groups *fg; 2757 ext4_group_t flex_group; 2758 int i, err; 2759 2760 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 2761 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { 2762 sbi->s_log_groups_per_flex = 0; 2763 return 1; 2764 } 2765 2766 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 2767 if (err) 2768 goto failed; 2769 2770 for (i = 0; i < sbi->s_groups_count; i++) { 2771 gdp = ext4_get_group_desc(sb, i, NULL); 2772 2773 flex_group = ext4_flex_group(sbi, i); 2774 fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); 2775 atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); 2776 atomic64_add(ext4_free_group_clusters(sb, gdp), 2777 &fg->free_clusters); 2778 atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); 2779 } 2780 2781 return 1; 2782failed: 2783 return 0; 2784} 2785 2786static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, 2787 struct ext4_group_desc *gdp) 2788{ 2789 int offset = offsetof(struct ext4_group_desc, bg_checksum); 2790 __u16 crc = 0; 2791 __le32 le_group = cpu_to_le32(block_group); 2792 struct ext4_sb_info *sbi = EXT4_SB(sb); 2793 2794 if (ext4_has_metadata_csum(sbi->s_sb)) { 2795 /* Use new metadata_csum algorithm */ 2796 __u32 csum32; 2797 __u16 dummy_csum = 0; 2798 2799 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, 2800 sizeof(le_group)); 2801 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); 2802 csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, 2803 sizeof(dummy_csum)); 2804 offset += sizeof(dummy_csum); 2805 if (offset < sbi->s_desc_size) 2806 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, 2807 sbi->s_desc_size - offset); 2808 2809 crc = csum32 & 0xFFFF; 2810 goto out; 2811 } 2812 2813 /* old crc16 code */ 2814 if (!ext4_has_feature_gdt_csum(sb)) 2815 return 0; 2816 2817 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); 2818 crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); 2819 crc = crc16(crc, (__u8 *)gdp, offset); 2820 offset += sizeof(gdp->bg_checksum); /* skip checksum */ 2821 /* for checksum of struct ext4_group_desc do the rest...*/ 2822 if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) 2823 crc = crc16(crc, (__u8 *)gdp + offset, 2824 sbi->s_desc_size - offset); 2825 2826out: 2827 return cpu_to_le16(crc); 2828} 2829 2830int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, 2831 struct ext4_group_desc *gdp) 2832{ 2833 if (ext4_has_group_desc_csum(sb) && 2834 (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) 2835 return 0; 2836 2837 return 1; 2838} 2839 2840void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, 2841 struct ext4_group_desc *gdp) 2842{ 2843 if (!ext4_has_group_desc_csum(sb)) 2844 return; 2845 gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); 2846} 2847 2848/* Called at mount-time, super-block is 
/* Called at mount-time, super-block is locked */
static int ext4_check_descriptors(struct super_block *sb,
				  ext4_fsblk_t sb_block,
				  ext4_group_t *first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
	ext4_fsblk_t last_block;
	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
	ext4_fsblk_t block_bitmap;
	ext4_fsblk_t inode_bitmap;
	ext4_fsblk_t inode_table;
	int flexbg_flag = 0;
	ext4_group_t i, grp = sbi->s_groups_count;

	if (ext4_has_feature_flex_bg(sb))
		flexbg_flag = 1;

	ext4_debug("Checking group descriptors");

	for (i = 0; i < sbi->s_groups_count; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

		if (i == sbi->s_groups_count - 1 || flexbg_flag)
			last_block = ext4_blocks_count(sbi->s_es) - 1;
		else
			last_block = first_block +
				(EXT4_BLOCKS_PER_GROUP(sb) - 1);

		if ((grp == sbi->s_groups_count) &&
		    !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			grp = i;

		block_bitmap = ext4_block_bitmap(sb, gdp);
		if (block_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap >= sb_block + 1 &&
		    block_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap < first_block || block_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u not in group "
				 "(block %llu)!", i, block_bitmap);
			return 0;
		}
		inode_bitmap = ext4_inode_bitmap(sb, gdp);
		if (inode_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap >= sb_block + 1 &&
		    inode_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap < first_block || inode_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u not in group "
				 "(block %llu)!", i, inode_bitmap);
			return 0;
		}
		inode_table = ext4_inode_table(sb, gdp);
		if (inode_table == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_table >= sb_block + 1 &&
		    inode_table <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_table < first_block ||
		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u not in group "
				 "(block %llu)!", i, inode_table);
			return 0;
		}
		ext4_lock_group(sb, i);
		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Checksum for group %u failed (%u!=%u)",
				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
				     gdp)), le16_to_cpu(gdp->bg_checksum));
			if (!sb_rdonly(sb)) {
				ext4_unlock_group(sb, i);
				return 0;
			}
		}
		ext4_unlock_group(sb, i);
		if (!flexbg_flag)
			first_block += EXT4_BLOCKS_PER_GROUP(sb);
	}
	if (NULL != first_not_zeroed)
		*first_not_zeroed = grp;
	return 1;
}

/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
 * the superblock) which were deleted from all directories, but held open by
 * a process at the time of a crash.  We walk the list and try to delete these
 * inodes at recovery time (only with a read-write filesystem).
 *
 * In order to keep the orphan inode chain consistent during traversal (in
 * case of crash during recovery), we link each inode into the superblock
 * orphan list_head and handle it the same way as an inode deletion during
 * normal operation (which journals the operations for us).
 *
 * We only do an iget() and an iput() on each inode, which is very safe if we
 * accidentally point at an in-use or already deleted inode.  The worst that
 * can happen in this case is that we get a "bit already cleared" message from
 * ext4_free_inode().  The only reason we would point at a wrong inode is if
 * e2fsck was run on this filesystem, and it must have already done the orphan
 * inode cleanup for us, so we can safely abort without any further action.
 */
static void ext4_orphan_cleanup(struct super_block *sb,
				struct ext4_super_block *es)
{
	unsigned int s_flags = sb->s_flags;
	int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
	int quota_update = 0;
	int i;
#endif
	if (!es->s_last_orphan) {
		jbd_debug(4, "no orphan inodes to clean up\n");
		return;
	}

	if (bdev_read_only(sb->s_bdev)) {
		ext4_msg(sb, KERN_ERR, "write access "
			"unavailable, skipping orphan cleanup");
		return;
	}

	/* Check if feature set would not allow a r/w mount */
	if (!ext4_feature_set_ok(sb, 0)) {
		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
			 "unknown ROCOMPAT features");
		return;
	}

	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
		/* don't clear list on RO mount w/ errors */
		if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
				 "clearing orphan list.\n");
			es->s_last_orphan = 0;
		}
		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
		return;
	}

	if (s_flags & SB_RDONLY) {
		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
		sb->s_flags &= ~SB_RDONLY;
	}
#ifdef CONFIG_QUOTA
	/*
	 * Turn on quotas which were not enabled for read-only mounts if
	 * filesystem has quota feature, so that they are updated correctly.
	 */
	if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
		int ret = ext4_enable_quotas(sb);

		if (!ret)
			quota_update = 1;
		else
			ext4_msg(sb, KERN_ERR,
				 "Cannot turn on quotas: error %d", ret);
	}

	/* Turn on journaled quotas used for old style quota files */
	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
		if (EXT4_SB(sb)->s_qf_names[i]) {
			int ret = ext4_quota_on_mount(sb, i);

			if (!ret)
				quota_update = 1;
			else
				ext4_msg(sb, KERN_ERR,
					 "Cannot turn on journaled "
					 "quota: type %d: error %d", i, ret);
		}
	}
#endif

	while (es->s_last_orphan) {
		struct inode *inode;

		/*
		 * We may have encountered an error during cleanup; if
		 * so, skip the rest.
		 */
		if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
			jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
			es->s_last_orphan = 0;
			break;
		}

		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
		if (IS_ERR(inode)) {
			es->s_last_orphan = 0;
			break;
		}

		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
		dquot_initialize(inode);
		if (inode->i_nlink) {
			if (test_opt(sb, DEBUG))
				ext4_msg(sb, KERN_DEBUG,
					 "%s: truncating inode %lu to %lld bytes",
					 __func__, inode->i_ino, inode->i_size);
			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
				  inode->i_ino, inode->i_size);
			inode_lock(inode);
			truncate_inode_pages(inode->i_mapping, inode->i_size);
			ret = ext4_truncate(inode);
			if (ret) {
				/*
				 * We need to clean up the in-core orphan list
				 * manually if ext4_truncate() failed to get a
				 * transaction handle.
				 */
				ext4_orphan_del(NULL, inode);
				ext4_std_error(inode->i_sb, ret);
			}
			inode_unlock(inode);
			nr_truncates++;
		} else {
			if (test_opt(sb, DEBUG))
				ext4_msg(sb, KERN_DEBUG,
					 "%s: deleting unreferenced inode %lu",
					 __func__, inode->i_ino);
			jbd_debug(2, "deleting unreferenced inode %lu\n",
				  inode->i_ino);
			nr_orphans++;
		}
		iput(inode);  /* The delete magic happens here! */
	}

#define PLURAL(x) (x), ((x) == 1) ? "" : "s"

	if (nr_orphans)
		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
			 PLURAL(nr_orphans));
	if (nr_truncates)
		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
			 PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
	/* Turn off quotas if they were enabled for orphan cleanup */
	if (quota_update) {
		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
			if (sb_dqopt(sb)->files[i])
				dquot_quota_off(sb, i);
		}
	}
#endif
	sb->s_flags = s_flags; /* Restore SB_RDONLY status */
}
/*
 * Maximal extent format file size.
 * Resulting logical blkno at s_maxbytes must fit in our on-disk
 * extent format containers, within a sector_t, and within i_blocks
 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
 * so that won't be a limiting factor.
 *
 * However, there is another limiting factor.  We do store extents in the
 * form of starting block and length, hence the resulting length of the
 * extent covering maximum file size must fit into on-disk format
 * containers as well.  Given that length is always by 1 unit bigger than
 * max unit (because we count 0 as well) we have to lower the s_maxbytes
 * by one fs block.
 *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 */
static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
	loff_t res;
	loff_t upper_limit = MAX_LFS_FILESIZE;

	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));

	if (!has_huge_files) {
		upper_limit = (1LL << 32) - 1;

		/* total blocks in file system block size */
		upper_limit >>= (blkbits - 9);
		upper_limit <<= blkbits;
	}

	/*
	 * 32-bit extent-start container, ee_block.  We lower the maxbytes
	 * by one fs block, so ee_len can cover the extent of maximum file
	 * size
	 */
	res = (1LL << 32) - 1;
	res <<= blkbits;

	/* Sanity check against vm- & vfs- imposed limits */
	if (res > upper_limit)
		res = upper_limit;

	return res;
}

/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
	unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
	int meta_blocks;

	/*
	 * This is calculated to be the largest file size for a dense, block
	 * mapped file such that the file's total number of 512-byte sectors,
	 * including data and all indirect blocks, does not exceed (2^48 - 1).
	 *
	 * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
	 * number of 512-byte sectors of the file.
	 */
	if (!has_huge_files) {
		/*
		 * !has_huge_files implies that the inode i_block field
		 * represents total file blocks in 2^32 512-byte sectors ==
		 * size of vfs inode i_blocks * 8
		 */
		upper_limit = (1LL << 32) - 1;

		/* total blocks in file system block size */
		upper_limit >>= (bits - 9);

	} else {
		/*
		 * We use 48 bit ext4_inode i_blocks
		 * With EXT4_HUGE_FILE_FL set the i_blocks
		 * represent total number of blocks in
		 * file system block size
		 */
		upper_limit = (1LL << 48) - 1;

	}

	/* indirect blocks */
	meta_blocks = 1;
	/* double indirect blocks */
	meta_blocks += 1 + (1LL << (bits-2));
	/* triple indirect blocks */
	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));

	upper_limit -= meta_blocks;
	upper_limit <<= bits;

	res += 1LL << (bits-2);
	res += 1LL << (2*(bits-2));
	res += 1LL << (3*(bits-2));
	res <<= bits;
	if (res > upper_limit)
		res = upper_limit;

	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	return (loff_t)res;
}
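/*
 * Worked example (added commentary, not in the original file), for the
 * common 4KiB block size (bits == 12, so bits-2 == 10 pointers-per-block
 * exponent) with huge_file support:
 *
 *	meta_blocks = 1				(indirect)
 *		    + 1 + 2^10			(double indirect)
 *		    + 1 + 2^10 + 2^20		(triple indirect)
 *
 *	res = 12 + 2^10 + 2^20 + 2^30 blocks, i.e. about 2^30 blocks of
 *	data, or roughly 4TiB once shifted by bits.  The extent-format
 *	limit from ext4_max_size() above is (2^32 - 1) << 12, i.e.
 *	16TiB - 4KiB.  Both results are then clamped against the i_blocks
 *	budget and MAX_LFS_FILESIZE.
 */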
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
				   ext4_fsblk_t logical_sb_block, int nr)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t bg, first_meta_bg;
	int has_super = 0;

	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);

	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
		return logical_sb_block + nr + 1;
	bg = sbi->s_desc_per_block * nr;
	if (ext4_bg_has_super(sb, bg))
		has_super = 1;

	/*
	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
	 * compensate.
	 */
	if (sb->s_blocksize == 1024 && nr == 0 &&
	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
		has_super++;

	return (has_super + ext4_group_first_block_no(sb, bg));
}

/**
 * ext4_get_stripe_size: Get the stripe size.
 * @sbi: In memory super block info
 *
 * If we have specified it via mount option, then
 * use the mount option value.  If the value specified at mount time is
 * greater than the blocks per group use the super block value.
 * If the super block value is greater than blocks per group return 0.
 * The allocator needs it to be less than blocks per group.
 */
static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
{
	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
	unsigned long stripe_width =
		le32_to_cpu(sbi->s_es->s_raid_stripe_width);
	int ret;

	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
		ret = sbi->s_stripe;
	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
		ret = stripe_width;
	else if (stride && stride <= sbi->s_blocks_per_group)
		ret = stride;
	else
		ret = 0;

	/*
	 * If the stripe width is 1, this makes no sense and
	 * we set it to 0 to turn off stripe handling code.
	 */
	if (ret <= 1)
		ret = 0;

	return ret;
}

/*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
 * Returns 1 if this filesystem can be mounted as requested,
 * 0 if it cannot be.
 */
static int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
	if (ext4_has_unknown_ext4_incompat_features(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Couldn't mount because of "
			 "unsupported optional features (%x)",
			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
			  ~EXT4_FEATURE_INCOMPAT_SUPP));
		return 0;
	}

#ifndef CONFIG_UNICODE
	if (ext4_has_feature_casefold(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Filesystem with casefold feature cannot be "
			 "mounted without CONFIG_UNICODE");
		return 0;
	}
#endif

	if (readonly)
		return 1;

	if (ext4_has_feature_readonly(sb)) {
		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
		sb->s_flags |= SB_RDONLY;
		return 1;
	}

	/* Check that feature set is OK for a read-write mount */
	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
			 "unsupported optional features (%x)",
			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
			  ~EXT4_FEATURE_RO_COMPAT_SUPP));
		return 0;
	}
	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Can't support bigalloc feature without "
			 "extents feature\n");
		return 0;
	}

#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
	if (!readonly && (ext4_has_feature_quota(sb) ||
			  ext4_has_feature_project(sb))) {
		ext4_msg(sb, KERN_ERR,
			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
		return 0;
	}
#endif  /* CONFIG_QUOTA */
	return 1;
}
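/*
 * Illustrative example (added commentary, not in the original file): the
 * precedence implemented by ext4_get_stripe_size() is mount option first,
 * then the superblock's RAID hints.  E.g. on a filesystem created with
 * "mke2fs -E stride=16,stripe-width=64", mounting with no options yields a
 * stripe of 64 (s_raid_stripe_width), while "mount -o stripe=32" yields 32;
 * any candidate larger than blocks-per-group, or <= 1, collapses to 0,
 * which disables the stripe-aligned allocator paths.
 */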
/*
 * This function is called once a day if we have errors logged
 * on the file system
 */
static void print_daily_error_info(struct timer_list *t)
{
	struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
	struct super_block *sb = sbi->s_sb;
	struct ext4_super_block *es = sbi->s_es;

	if (es->s_error_count)
		/* fsck newer than v1.41.13 is needed to clean this condition. */
		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
			 le32_to_cpu(es->s_error_count));
	if (es->s_first_error_time) {
		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
		       sb->s_id,
		       ext4_get_tstamp(es, s_first_error_time),
		       (int) sizeof(es->s_first_error_func),
		       es->s_first_error_func,
		       le32_to_cpu(es->s_first_error_line));
		if (es->s_first_error_ino)
			printk(KERN_CONT ": inode %u",
			       le32_to_cpu(es->s_first_error_ino));
		if (es->s_first_error_block)
			printk(KERN_CONT ": block %llu", (unsigned long long)
			       le64_to_cpu(es->s_first_error_block));
		printk(KERN_CONT "\n");
	}
	if (es->s_last_error_time) {
		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
		       sb->s_id,
		       ext4_get_tstamp(es, s_last_error_time),
		       (int) sizeof(es->s_last_error_func),
		       es->s_last_error_func,
		       le32_to_cpu(es->s_last_error_line));
		if (es->s_last_error_ino)
			printk(KERN_CONT ": inode %u",
			       le32_to_cpu(es->s_last_error_ino));
		if (es->s_last_error_block)
			printk(KERN_CONT ": block %llu", (unsigned long long)
			       le64_to_cpu(es->s_last_error_block));
		printk(KERN_CONT "\n");
	}
	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
}

/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
	struct ext4_group_desc *gdp = NULL;
	struct super_block *sb = elr->lr_super;
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
	ext4_group_t group = elr->lr_next_group;
	unsigned int prefetch_ios = 0;
	int ret = 0;
	u64 start_time;

	if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
		elr->lr_next_group = ext4_mb_prefetch(sb, group,
				EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
		if (prefetch_ios)
			ext4_mb_prefetch_fini(sb, elr->lr_next_group,
					      prefetch_ios);
		trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
					    prefetch_ios);
		if (group >= elr->lr_next_group) {
			ret = 1;
			if (elr->lr_first_not_zeroed != ngroups &&
			    !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
				elr->lr_next_group = elr->lr_first_not_zeroed;
				elr->lr_mode = EXT4_LI_MODE_ITABLE;
				ret = 0;
			}
		}
		return ret;
	}

	for (; group < ngroups; group++) {
		gdp = ext4_get_group_desc(sb, group, NULL);
		if (!gdp) {
			ret = 1;
			break;
		}

		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			break;
	}

	if (group >= ngroups)
		ret = 1;

	if (!ret) {
		start_time = ktime_get_real_ns();
		ret = ext4_init_inode_table(sb, group,
					    elr->lr_timeout ? 0 : 1);
		trace_ext4_lazy_itable_init(sb, group);
		if (elr->lr_timeout == 0) {
			elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
				EXT4_SB(elr->lr_super)->s_li_wait_mult);
		}
		elr->lr_next_sched = jiffies + elr->lr_timeout;
		elr->lr_next_group = group + 1;
	}
	return ret;
}
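/*
 * Worked example (added commentary, not in the original file): the pacing
 * above is "sleep s_li_wait_mult times as long as one chunk of work took".
 * If zeroing one group's inode table takes 10ms and s_li_wait_mult is the
 * default EXT4_DEF_LI_WAIT_MULT, assumed here to be 10, then lr_timeout
 * becomes the jiffies equivalent of 100ms, so the lazy-init thread spends
 * roughly 1/11 of wall-clock time doing I/O until all groups are done.
 */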
/*
 * Remove lr_request from the list_request and free the
 * request structure.  Should be called with li_list_mtx held.
 */
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
	if (!elr)
		return;

	list_del(&elr->lr_request);
	EXT4_SB(elr->lr_super)->s_li_request = NULL;
	kfree(elr);
}

static void ext4_unregister_li_request(struct super_block *sb)
{
	mutex_lock(&ext4_li_mtx);
	if (!ext4_li_info) {
		mutex_unlock(&ext4_li_mtx);
		return;
	}

	mutex_lock(&ext4_li_info->li_list_mtx);
	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
	mutex_unlock(&ext4_li_info->li_list_mtx);
	mutex_unlock(&ext4_li_mtx);
}

static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where the ext4lazyinit thread lives.  It walks
 * through the request list searching for the next scheduled filesystem.
 * When such a filesystem is found, it runs the lazy initialization request
 * (ext4_run_li_request) and keeps track of the time spent in this
 * function.  Based on that time we compute the next schedule time of
 * the request.  When the walk through the list is complete, it computes
 * the next wakeup time and puts itself to sleep.
 */
static int ext4_lazyinit_thread(void *arg)
{
	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
	struct list_head *pos, *n;
	struct ext4_li_request *elr;
	unsigned long next_wakeup, cur;

	BUG_ON(NULL == eli);
	set_freezable();

cont_thread:
	while (true) {
		next_wakeup = MAX_JIFFY_OFFSET;

		mutex_lock(&eli->li_list_mtx);
		if (list_empty(&eli->li_request_list)) {
			mutex_unlock(&eli->li_list_mtx);
			goto exit_thread;
		}
		list_for_each_safe(pos, n, &eli->li_request_list) {
			int err = 0;
			int progress = 0;
			elr = list_entry(pos, struct ext4_li_request,
					 lr_request);

			if (time_before(jiffies, elr->lr_next_sched)) {
				if (time_before(elr->lr_next_sched, next_wakeup))
					next_wakeup = elr->lr_next_sched;
				continue;
			}
			if (down_read_trylock(&elr->lr_super->s_umount)) {
				if (sb_start_write_trylock(elr->lr_super)) {
					progress = 1;
					/*
					 * We hold sb->s_umount, sb can not
					 * be removed from the list, it is
					 * now safe to drop li_list_mtx
					 */
					mutex_unlock(&eli->li_list_mtx);
					err = ext4_run_li_request(elr);
					sb_end_write(elr->lr_super);
					mutex_lock(&eli->li_list_mtx);
					n = pos->next;
				}
				up_read((&elr->lr_super->s_umount));
			}
			/* error, remove the lazy_init job */
			if (err) {
				ext4_remove_li_request(elr);
				continue;
			}
			if (!progress) {
				elr->lr_next_sched = jiffies +
					(prandom_u32()
					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
			}
			if (time_before(elr->lr_next_sched, next_wakeup))
				next_wakeup = elr->lr_next_sched;
		}
		mutex_unlock(&eli->li_list_mtx);

		try_to_freeze();

		cur = jiffies;
		if ((time_after_eq(cur, next_wakeup)) ||
		    (MAX_JIFFY_OFFSET == next_wakeup)) {
			cond_resched();
			continue;
		}

		schedule_timeout_interruptible(next_wakeup - cur);

		if (kthread_should_stop()) {
			ext4_clear_request_list();
			goto exit_thread;
		}
	}

exit_thread:
	/*
	 * It looks like the request list is empty, but we need
	 * to check it under the li_list_mtx lock, to prevent any
	 * additions into it, and of course we should lock ext4_li_mtx
	 * to atomically free the list and ext4_li_info, because at
	 * this point another ext4 filesystem could be registering
	 * a new one.
	 */
	mutex_lock(&ext4_li_mtx);
	mutex_lock(&eli->li_list_mtx);
	if (!list_empty(&eli->li_request_list)) {
		mutex_unlock(&eli->li_list_mtx);
		mutex_unlock(&ext4_li_mtx);
		goto cont_thread;
	}
	mutex_unlock(&eli->li_list_mtx);
	kfree(ext4_li_info);
	ext4_li_info = NULL;
	mutex_unlock(&ext4_li_mtx);

	return 0;
}

static void ext4_clear_request_list(void)
{
	struct list_head *pos, *n;
	struct ext4_li_request *elr;

	mutex_lock(&ext4_li_info->li_list_mtx);
	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
		elr = list_entry(pos, struct ext4_li_request,
				 lr_request);
		ext4_remove_li_request(elr);
	}
	mutex_unlock(&ext4_li_info->li_list_mtx);
}

static int ext4_run_lazyinit_thread(void)
{
	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
					 ext4_li_info, "ext4lazyinit");
	if (IS_ERR(ext4_lazyinit_task)) {
		int err = PTR_ERR(ext4_lazyinit_task);
		ext4_clear_request_list();
		kfree(ext4_li_info);
		ext4_li_info = NULL;
		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
				 "initialization thread\n",
				 err);
		return err;
	}
	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
	return 0;
}

/*
 * Check whether it makes sense to run the itable init thread or not.
 * If there is at least one uninitialized inode table, return the
 * corresponding group number, else the loop goes through all
 * groups and returns the total number of groups.
 */
static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
{
	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
	struct ext4_group_desc *gdp = NULL;

	if (!ext4_has_group_desc_csum(sb))
		return ngroups;

	for (group = 0; group < ngroups; group++) {
		gdp = ext4_get_group_desc(sb, group, NULL);
		if (!gdp)
			continue;

		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			break;
	}

	return group;
}

static int ext4_li_info_new(void)
{
	struct ext4_lazy_init *eli = NULL;

	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
	if (!eli)
		return -ENOMEM;

	INIT_LIST_HEAD(&eli->li_request_list);
	mutex_init(&eli->li_list_mtx);

	eli->li_state |= EXT4_LAZYINIT_QUIT;

	ext4_li_info = eli;

	return 0;
}
3711 */ 3712 elr->lr_next_sched = jiffies + (prandom_u32() % 3713 (EXT4_DEF_LI_MAX_START_DELAY * HZ)); 3714 return elr; 3715} 3716 3717int ext4_register_li_request(struct super_block *sb, 3718 ext4_group_t first_not_zeroed) 3719{ 3720 struct ext4_sb_info *sbi = EXT4_SB(sb); 3721 struct ext4_li_request *elr = NULL; 3722 ext4_group_t ngroups = sbi->s_groups_count; 3723 int ret = 0; 3724 3725 mutex_lock(&ext4_li_mtx); 3726 if (sbi->s_li_request != NULL) { 3727 /* 3728 * Reset timeout so it can be computed again, because 3729 * s_li_wait_mult might have changed. 3730 */ 3731 sbi->s_li_request->lr_timeout = 0; 3732 goto out; 3733 } 3734 3735 if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && 3736 (first_not_zeroed == ngroups || sb_rdonly(sb) || 3737 !test_opt(sb, INIT_INODE_TABLE))) 3738 goto out; 3739 3740 elr = ext4_li_request_new(sb, first_not_zeroed); 3741 if (!elr) { 3742 ret = -ENOMEM; 3743 goto out; 3744 } 3745 3746 if (NULL == ext4_li_info) { 3747 ret = ext4_li_info_new(); 3748 if (ret) 3749 goto out; 3750 } 3751 3752 mutex_lock(&ext4_li_info->li_list_mtx); 3753 list_add(&elr->lr_request, &ext4_li_info->li_request_list); 3754 mutex_unlock(&ext4_li_info->li_list_mtx); 3755 3756 sbi->s_li_request = elr; 3757 /* 3758 * set elr to NULL here since it has been inserted to 3759 * the request_list and the removal and free of it is 3760 * handled by ext4_clear_request_list from now on. 3761 */ 3762 elr = NULL; 3763 3764 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3765 ret = ext4_run_lazyinit_thread(); 3766 if (ret) 3767 goto out; 3768 } 3769out: 3770 mutex_unlock(&ext4_li_mtx); 3771 if (ret) 3772 kfree(elr); 3773 return ret; 3774} 3775 3776/* 3777 * We do not need to lock anything since this is called on 3778 * module unload. 3779 */ 3780static void ext4_destroy_lazyinit_thread(void) 3781{ 3782 /* 3783 * If thread exited earlier 3784 * there's nothing to be done. 3785 */ 3786 if (!ext4_li_info || !ext4_lazyinit_task) 3787 return; 3788 3789 kthread_stop(ext4_lazyinit_task); 3790} 3791 3792static int set_journal_csum_feature_set(struct super_block *sb) 3793{ 3794 int ret = 1; 3795 int compat, incompat; 3796 struct ext4_sb_info *sbi = EXT4_SB(sb); 3797 3798 if (ext4_has_metadata_csum(sb)) { 3799 /* journal checksum v3 */ 3800 compat = 0; 3801 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; 3802 } else { 3803 /* journal checksum v1 */ 3804 compat = JBD2_FEATURE_COMPAT_CHECKSUM; 3805 incompat = 0; 3806 } 3807 3808 jbd2_journal_clear_features(sbi->s_journal, 3809 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3810 JBD2_FEATURE_INCOMPAT_CSUM_V3 | 3811 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3812 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 3813 ret = jbd2_journal_set_features(sbi->s_journal, 3814 compat, 0, 3815 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3816 incompat); 3817 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 3818 ret = jbd2_journal_set_features(sbi->s_journal, 3819 compat, 0, 3820 incompat); 3821 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3822 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3823 } else { 3824 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3825 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3826 } 3827 3828 return ret; 3829} 3830 3831/* 3832 * Note: calculating the overhead so we can be compatible with 3833 * historical BSD practice is quite difficult in the face of 3834 * clusters/bigalloc. This is because multiple metadata blocks from 3835 * different block group can end up in the same allocation cluster. 
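 * (With flex_bg, for instance, the bitmaps and inode tables of a whole
 * flex group are packed next to each other and can share clusters.)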
3836 * Calculating the exact overhead in the face of clustered allocation 3837 * requires either O(all block bitmaps) in memory or O(number of block 3838 * groups**2) in time. We will still calculate the superblock for 3839 * older file systems --- and if we come across with a bigalloc file 3840 * system with zero in s_overhead_clusters the estimate will be close to 3841 * correct especially for very large cluster sizes --- but for newer 3842 * file systems, it's better to calculate this figure once at mkfs 3843 * time, and store it in the superblock. If the superblock value is 3844 * present (even for non-bigalloc file systems), we will use it. 3845 */ 3846static int count_overhead(struct super_block *sb, ext4_group_t grp, 3847 char *buf) 3848{ 3849 struct ext4_sb_info *sbi = EXT4_SB(sb); 3850 struct ext4_group_desc *gdp; 3851 ext4_fsblk_t first_block, last_block, b; 3852 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3853 int s, j, count = 0; 3854 int has_super = ext4_bg_has_super(sb, grp); 3855 3856 if (!ext4_has_feature_bigalloc(sb)) 3857 return (has_super + ext4_bg_num_gdb(sb, grp) + 3858 (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + 3859 sbi->s_itb_per_group + 2); 3860 3861 first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + 3862 (grp * EXT4_BLOCKS_PER_GROUP(sb)); 3863 last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; 3864 for (i = 0; i < ngroups; i++) { 3865 gdp = ext4_get_group_desc(sb, i, NULL); 3866 b = ext4_block_bitmap(sb, gdp); 3867 if (b >= first_block && b <= last_block) { 3868 ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); 3869 count++; 3870 } 3871 b = ext4_inode_bitmap(sb, gdp); 3872 if (b >= first_block && b <= last_block) { 3873 ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); 3874 count++; 3875 } 3876 b = ext4_inode_table(sb, gdp); 3877 if (b >= first_block && b + sbi->s_itb_per_group <= last_block) 3878 for (j = 0; j < sbi->s_itb_per_group; j++, b++) { 3879 int c = EXT4_B2C(sbi, b - first_block); 3880 ext4_set_bit(c, buf); 3881 count++; 3882 } 3883 if (i != grp) 3884 continue; 3885 s = 0; 3886 if (ext4_bg_has_super(sb, grp)) { 3887 ext4_set_bit(s++, buf); 3888 count++; 3889 } 3890 j = ext4_bg_num_gdb(sb, grp); 3891 if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { 3892 ext4_error(sb, "Invalid number of block group " 3893 "descriptor blocks: %d", j); 3894 j = EXT4_BLOCKS_PER_GROUP(sb) - s; 3895 } 3896 count += j; 3897 for (; j > 0; j--) 3898 ext4_set_bit(EXT4_B2C(sbi, s++), buf); 3899 } 3900 if (!count) 3901 return 0; 3902 return EXT4_CLUSTERS_PER_GROUP(sb) - 3903 ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); 3904} 3905 3906/* 3907 * Compute the overhead and stash it in sbi->s_overhead 3908 */ 3909int ext4_calculate_overhead(struct super_block *sb) 3910{ 3911 struct ext4_sb_info *sbi = EXT4_SB(sb); 3912 struct ext4_super_block *es = sbi->s_es; 3913 struct inode *j_inode; 3914 unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); 3915 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3916 ext4_fsblk_t overhead = 0; 3917 char *buf = (char *) get_zeroed_page(GFP_NOFS); 3918 3919 if (!buf) 3920 return -ENOMEM; 3921 3922 /* 3923 * Compute the overhead (FS structures). This is constant 3924 * for a given filesystem unless the number of block groups 3925 * changes so we cache the previous value until it does. 
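	 * (The group count only changes when the filesystem is resized,
	 * so in effect this is computed once per mount.)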
3926 */ 3927 3928 /* 3929 * All of the blocks before first_data_block are overhead 3930 */ 3931 overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); 3932 3933 /* 3934 * Add the overhead found in each block group 3935 */ 3936 for (i = 0; i < ngroups; i++) { 3937 int blks; 3938 3939 blks = count_overhead(sb, i, buf); 3940 overhead += blks; 3941 if (blks) 3942 memset(buf, 0, PAGE_SIZE); 3943 cond_resched(); 3944 } 3945 3946 /* 3947 * Add the internal journal blocks whether the journal has been 3948 * loaded or not 3949 */ 3950 if (sbi->s_journal && !sbi->s_journal_bdev) 3951 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); 3952 else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { 3953 /* j_inum for internal journal is non-zero */ 3954 j_inode = ext4_get_journal_inode(sb, j_inum); 3955 if (j_inode) { 3956 j_blocks = j_inode->i_size >> sb->s_blocksize_bits; 3957 overhead += EXT4_NUM_B2C(sbi, j_blocks); 3958 iput(j_inode); 3959 } else { 3960 ext4_msg(sb, KERN_ERR, "can't get journal size"); 3961 } 3962 } 3963 sbi->s_overhead = overhead; 3964 smp_wmb(); 3965 free_page((unsigned long) buf); 3966 return 0; 3967} 3968 3969static void ext4_set_resv_clusters(struct super_block *sb) 3970{ 3971 ext4_fsblk_t resv_clusters; 3972 struct ext4_sb_info *sbi = EXT4_SB(sb); 3973 3974 /* 3975 * There's no need to reserve anything when we aren't using extents. 3976 * The space estimates are exact, there are no unwritten extents, 3977 * hole punching doesn't need new metadata... This is needed especially 3978 * to keep ext2/3 backward compatibility. 3979 */ 3980 if (!ext4_has_feature_extents(sb)) 3981 return; 3982 /* 3983 * By default we reserve 2% or 4096 clusters, whichever is smaller. 3984 * This should cover the situations where we can not afford to run 3985 * out of space like for example punch hole, or converting 3986 * unwritten extents in delalloc path. In most cases such 3987 * allocation would require 1, or 2 blocks, higher numbers are 3988 * very rare. 
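	 * As a worked example: with 4KiB clusters the 4096-cluster cap
	 * equals 16MiB, and the 2% figure only comes out smaller on
	 * filesystems below 4096 * 50 clusters, i.e. under about 800MiB.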
3989 */ 3990 resv_clusters = (ext4_blocks_count(sbi->s_es) >> 3991 sbi->s_cluster_bits); 3992 3993 do_div(resv_clusters, 50); 3994 resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); 3995 3996 atomic64_set(&sbi->s_resv_clusters, resv_clusters); 3997} 3998 3999static int ext4_fill_super(struct super_block *sb, void *data, int silent) 4000{ 4001 struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); 4002 char *orig_data = kstrdup(data, GFP_KERNEL); 4003 struct buffer_head *bh, **group_desc; 4004 struct ext4_super_block *es = NULL; 4005 struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 4006 struct flex_groups **flex_groups; 4007 ext4_fsblk_t block; 4008 ext4_fsblk_t sb_block = get_sb_block(&data); 4009 ext4_fsblk_t logical_sb_block; 4010 unsigned long offset = 0; 4011 unsigned long journal_devnum = 0; 4012 unsigned long def_mount_opts; 4013 struct inode *root; 4014 const char *descr; 4015 int ret = -ENOMEM; 4016 int blocksize, clustersize; 4017 unsigned int db_count; 4018 unsigned int i; 4019 int needs_recovery, has_huge_files; 4020 __u64 blocks_count; 4021 int err = 0; 4022 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4023 ext4_group_t first_not_zeroed; 4024 4025 if ((data && !orig_data) || !sbi) 4026 goto out_free_base; 4027 4028 sbi->s_daxdev = dax_dev; 4029 sbi->s_blockgroup_lock = 4030 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 4031 if (!sbi->s_blockgroup_lock) 4032 goto out_free_base; 4033 4034 sb->s_fs_info = sbi; 4035 sbi->s_sb = sb; 4036 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 4037 sbi->s_sb_block = sb_block; 4038 if (sb->s_bdev->bd_part) 4039 sbi->s_sectors_written_start = 4040 part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); 4041 4042 /* Cleanup superblock name */ 4043 strreplace(sb->s_id, '/', '!'); 4044 4045 /* -EINVAL is default */ 4046 ret = -EINVAL; 4047 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 4048 if (!blocksize) { 4049 ext4_msg(sb, KERN_ERR, "unable to set blocksize"); 4050 goto out_fail; 4051 } 4052 4053 /* 4054 * The ext4 superblock will not be buffer aligned for other than 1kB 4055 * block sizes. We need to calculate the offset from buffer start. 4056 */ 4057 if (blocksize != EXT4_MIN_BLOCK_SIZE) { 4058 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 4059 offset = do_div(logical_sb_block, blocksize); 4060 } else { 4061 logical_sb_block = sb_block; 4062 } 4063 4064 bh = ext4_sb_bread_unmovable(sb, logical_sb_block); 4065 if (IS_ERR(bh)) { 4066 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 4067 ret = PTR_ERR(bh); 4068 bh = NULL; 4069 goto out_fail; 4070 } 4071 /* 4072 * Note: s_es must be initialized as soon as possible because 4073 * some ext4 macro-instructions depend on its value 4074 */ 4075 es = (struct ext4_super_block *) (bh->b_data + offset); 4076 sbi->s_es = es; 4077 sb->s_magic = le16_to_cpu(es->s_magic); 4078 if (sb->s_magic != EXT4_SUPER_MAGIC) 4079 goto cantfind_ext4; 4080 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); 4081 4082 /* Warn if metadata_csum and gdt_csum are both set. 
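	 * metadata_csum supersedes the older uninit_bg (gdt_csum)
	 * feature, so a consistent filesystem carries at most one of
	 * the two.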
*/ 4083 if (ext4_has_feature_metadata_csum(sb) && 4084 ext4_has_feature_gdt_csum(sb)) 4085 ext4_warning(sb, "metadata_csum and uninit_bg are " 4086 "redundant flags; please run fsck."); 4087 4088 /* Check for a known checksum algorithm */ 4089 if (!ext4_verify_csum_type(sb, es)) { 4090 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " 4091 "unknown checksum algorithm."); 4092 silent = 1; 4093 goto cantfind_ext4; 4094 } 4095 4096 /* Load the checksum driver */ 4097 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 4098 if (IS_ERR(sbi->s_chksum_driver)) { 4099 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); 4100 ret = PTR_ERR(sbi->s_chksum_driver); 4101 sbi->s_chksum_driver = NULL; 4102 goto failed_mount; 4103 } 4104 4105 /* Check superblock checksum */ 4106 if (!ext4_superblock_csum_verify(sb, es)) { 4107 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " 4108 "invalid superblock checksum. Run e2fsck?"); 4109 silent = 1; 4110 ret = -EFSBADCRC; 4111 goto cantfind_ext4; 4112 } 4113 4114 /* Precompute checksum seed for all metadata */ 4115 if (ext4_has_feature_csum_seed(sb)) 4116 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); 4117 else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) 4118 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 4119 sizeof(es->s_uuid)); 4120 4121 /* Set defaults before we parse the mount options */ 4122 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 4123 set_opt(sb, INIT_INODE_TABLE); 4124 if (def_mount_opts & EXT4_DEFM_DEBUG) 4125 set_opt(sb, DEBUG); 4126 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 4127 set_opt(sb, GRPID); 4128 if (def_mount_opts & EXT4_DEFM_UID16) 4129 set_opt(sb, NO_UID32); 4130 /* xattr user namespace & acls are now defaulted on */ 4131 set_opt(sb, XATTR_USER); 4132#ifdef CONFIG_EXT4_FS_POSIX_ACL 4133 set_opt(sb, POSIX_ACL); 4134#endif 4135 if (ext4_has_feature_fast_commit(sb)) 4136 set_opt2(sb, JOURNAL_FAST_COMMIT); 4137 /* don't forget to enable journal_csum when metadata_csum is enabled. 
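	 * That way the journal's contents are covered by checksums as
	 * well, not just the filesystem's own metadata.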
*/ 4138 if (ext4_has_metadata_csum(sb)) 4139 set_opt(sb, JOURNAL_CHECKSUM); 4140 4141 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 4142 set_opt(sb, JOURNAL_DATA); 4143 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 4144 set_opt(sb, ORDERED_DATA); 4145 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 4146 set_opt(sb, WRITEBACK_DATA); 4147 4148 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 4149 set_opt(sb, ERRORS_PANIC); 4150 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 4151 set_opt(sb, ERRORS_CONT); 4152 else 4153 set_opt(sb, ERRORS_RO); 4154 /* block_validity enabled by default; disable with noblock_validity */ 4155 set_opt(sb, BLOCK_VALIDITY); 4156 if (def_mount_opts & EXT4_DEFM_DISCARD) 4157 set_opt(sb, DISCARD); 4158 4159 sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); 4160 sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); 4161 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 4162 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 4163 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 4164 4165 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 4166 set_opt(sb, BARRIER); 4167 4168 /* 4169 * enable delayed allocation by default 4170 * Use -o nodelalloc to turn it off 4171 */ 4172 if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && 4173 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 4174 set_opt(sb, DELALLOC); 4175 4176 /* 4177 * set default s_li_wait_mult for lazyinit, for the case there is 4178 * no mount option specified. 4179 */ 4180 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 4181 4182 if (le32_to_cpu(es->s_log_block_size) > 4183 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4184 ext4_msg(sb, KERN_ERR, 4185 "Invalid log block size: %u", 4186 le32_to_cpu(es->s_log_block_size)); 4187 goto failed_mount; 4188 } 4189 if (le32_to_cpu(es->s_log_cluster_size) > 4190 (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4191 ext4_msg(sb, KERN_ERR, 4192 "Invalid log cluster size: %u", 4193 le32_to_cpu(es->s_log_cluster_size)); 4194 goto failed_mount; 4195 } 4196 4197 blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4198 4199 if (blocksize == PAGE_SIZE) 4200 set_opt(sb, DIOREAD_NOLOCK); 4201 4202 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 4203 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; 4204 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; 4205 } else { 4206 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 4207 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 4208 if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { 4209 ext4_msg(sb, KERN_ERR, "invalid first ino: %u", 4210 sbi->s_first_ino); 4211 goto failed_mount; 4212 } 4213 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || 4214 (!is_power_of_2(sbi->s_inode_size)) || 4215 (sbi->s_inode_size > blocksize)) { 4216 ext4_msg(sb, KERN_ERR, 4217 "unsupported inode size: %d", 4218 sbi->s_inode_size); 4219 ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); 4220 goto failed_mount; 4221 } 4222 /* 4223 * i_atime_extra is the last extra field available for 4224 * [acm]times in struct ext4_inode. Checking for that 4225 * field should suffice to ensure we have extra space 4226 * for all three. 
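		 * (i_ctime_extra and i_mtime_extra live at lower offsets
		 * in the structure, so room for i_atime_extra implies
		 * room for the other two as well.)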
4227 */ 4228 if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + 4229 sizeof(((struct ext4_inode *)0)->i_atime_extra)) { 4230 sb->s_time_gran = 1; 4231 sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; 4232 } else { 4233 sb->s_time_gran = NSEC_PER_SEC; 4234 sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; 4235 } 4236 sb->s_time_min = EXT4_TIMESTAMP_MIN; 4237 } 4238 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { 4239 sbi->s_want_extra_isize = sizeof(struct ext4_inode) - 4240 EXT4_GOOD_OLD_INODE_SIZE; 4241 if (ext4_has_feature_extra_isize(sb)) { 4242 unsigned v, max = (sbi->s_inode_size - 4243 EXT4_GOOD_OLD_INODE_SIZE); 4244 4245 v = le16_to_cpu(es->s_want_extra_isize); 4246 if (v > max) { 4247 ext4_msg(sb, KERN_ERR, 4248 "bad s_want_extra_isize: %d", v); 4249 goto failed_mount; 4250 } 4251 if (sbi->s_want_extra_isize < v) 4252 sbi->s_want_extra_isize = v; 4253 4254 v = le16_to_cpu(es->s_min_extra_isize); 4255 if (v > max) { 4256 ext4_msg(sb, KERN_ERR, 4257 "bad s_min_extra_isize: %d", v); 4258 goto failed_mount; 4259 } 4260 if (sbi->s_want_extra_isize < v) 4261 sbi->s_want_extra_isize = v; 4262 } 4263 } 4264 4265 if (sbi->s_es->s_mount_opts[0]) { 4266 char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, 4267 sizeof(sbi->s_es->s_mount_opts), 4268 GFP_KERNEL); 4269 if (!s_mount_opts) 4270 goto failed_mount; 4271 if (!parse_options(s_mount_opts, sb, &journal_devnum, 4272 &journal_ioprio, 0)) { 4273 ext4_msg(sb, KERN_WARNING, 4274 "failed to parse options in superblock: %s", 4275 s_mount_opts); 4276 } 4277 kfree(s_mount_opts); 4278 } 4279 sbi->s_def_mount_opt = sbi->s_mount_opt; 4280 if (!parse_options((char *) data, sb, &journal_devnum, 4281 &journal_ioprio, 0)) 4282 goto failed_mount; 4283 4284#ifdef CONFIG_UNICODE 4285 if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { 4286 const struct ext4_sb_encodings *encoding_info; 4287 struct unicode_map *encoding; 4288 __u16 encoding_flags; 4289 4290 if (ext4_has_feature_encrypt(sb)) { 4291 ext4_msg(sb, KERN_ERR, 4292 "Can't mount with encoding and encryption"); 4293 goto failed_mount; 4294 } 4295 4296 if (ext4_sb_read_encoding(es, &encoding_info, 4297 &encoding_flags)) { 4298 ext4_msg(sb, KERN_ERR, 4299 "Encoding requested by superblock is unknown"); 4300 goto failed_mount; 4301 } 4302 4303 encoding = utf8_load(encoding_info->version); 4304 if (IS_ERR(encoding)) { 4305 ext4_msg(sb, KERN_ERR, 4306 "can't mount with superblock charset: %s-%s " 4307 "not supported by the kernel. flags: 0x%x.", 4308 encoding_info->name, encoding_info->version, 4309 encoding_flags); 4310 goto failed_mount; 4311 } 4312 ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " 4313 "%s-%s with flags 0x%hx", encoding_info->name, 4314 encoding_info->version?:"\b", encoding_flags); 4315 4316 sb->s_encoding = encoding; 4317 sb->s_encoding_flags = encoding_flags; 4318 } 4319#endif 4320 4321 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4322 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); 4323 /* can't mount with both data=journal and dioread_nolock. 
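		 * Both options are quietly cleared below; explicitly
		 * requested delalloc or DAX, by contrast, make the mount
		 * fail outright.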
		 */
		clear_opt(sb, DIOREAD_NOLOCK);
		clear_opt2(sb, JOURNAL_FAST_COMMIT);
		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and delalloc");
			goto failed_mount;
		}
		if (test_opt(sb, DAX_ALWAYS)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and dax");
			goto failed_mount;
		}
		if (ext4_has_feature_encrypt(sb)) {
			ext4_msg(sb, KERN_WARNING,
				 "encrypted files will use data=ordered "
				 "instead of data journaling mode");
		}
		if (test_opt(sb, DELALLOC))
			clear_opt(sb, DELALLOC);
	} else {
		sb->s_iflags |= SB_I_CGROUPWB;
	}

	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
	    (ext4_has_compat_features(sb) ||
	     ext4_has_ro_compat_features(sb) ||
	     ext4_has_incompat_features(sb)))
		ext4_msg(sb, KERN_WARNING,
		       "feature flags set on rev 0 fs, "
		       "running e2fsck is recommended");

	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
		set_opt2(sb, HURD_COMPAT);
		if (ext4_has_feature_64bit(sb)) {
			ext4_msg(sb, KERN_ERR,
				 "The Hurd can't support 64-bit file systems");
			goto failed_mount;
		}

		/*
		 * ea_inode feature uses l_i_version field which is not
		 * available in HURD_COMPAT mode.
		 */
		if (ext4_has_feature_ea_inode(sb)) {
			ext4_msg(sb, KERN_ERR,
				 "ea_inode feature is not supported for Hurd");
			goto failed_mount;
		}
	}

	if (IS_EXT2_SB(sb)) {
		if (ext2_feature_set_ok(sb))
			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
				 "using the ext4 subsystem");
		else {
			/*
			 * If we're probing, be silent if this looks like
			 * it's actually an ext[34] filesystem.
			 */
			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
				goto failed_mount;
			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
				 "to feature incompatibilities");
			goto failed_mount;
		}
	}

	if (IS_EXT3_SB(sb)) {
		if (ext3_feature_set_ok(sb))
			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
				 "using the ext4 subsystem");
		else {
			/*
			 * If we're probing, be silent if this looks like
			 * it's actually an ext4 filesystem.
			 */
			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
				goto failed_mount;
			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
				 "to feature incompatibilities");
			goto failed_mount;
		}
	}

	/*
	 * Check feature flags regardless of the revision level, since we
	 * previously didn't change the revision level when setting the flags,
	 * so there is a chance incompat flags are set on a rev 0 filesystem.
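	 * Trusting s_rev_level alone could therefore let us mount a
	 * filesystem whose feature set this kernel does not understand.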
4415 */ 4416 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4417 goto failed_mount; 4418 4419 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { 4420 ext4_msg(sb, KERN_ERR, 4421 "Number of reserved GDT blocks insanely large: %d", 4422 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); 4423 goto failed_mount; 4424 } 4425 4426 if (bdev_dax_supported(sb->s_bdev, blocksize)) 4427 set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 4428 4429 if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { 4430 if (ext4_has_feature_inline_data(sb)) { 4431 ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" 4432 " that may contain inline data"); 4433 goto failed_mount; 4434 } 4435 if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { 4436 ext4_msg(sb, KERN_ERR, 4437 "DAX unsupported by block device."); 4438 goto failed_mount; 4439 } 4440 } 4441 4442 if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { 4443 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", 4444 es->s_encryption_level); 4445 goto failed_mount; 4446 } 4447 4448 if (sb->s_blocksize != blocksize) { 4449 /* 4450 * bh must be released before kill_bdev(), otherwise 4451 * it won't be freed and its page also. kill_bdev() 4452 * is called by sb_set_blocksize(). 4453 */ 4454 brelse(bh); 4455 /* Validate the filesystem blocksize */ 4456 if (!sb_set_blocksize(sb, blocksize)) { 4457 ext4_msg(sb, KERN_ERR, "bad block size %d", 4458 blocksize); 4459 bh = NULL; 4460 goto failed_mount; 4461 } 4462 4463 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 4464 offset = do_div(logical_sb_block, blocksize); 4465 bh = ext4_sb_bread_unmovable(sb, logical_sb_block); 4466 if (IS_ERR(bh)) { 4467 ext4_msg(sb, KERN_ERR, 4468 "Can't read superblock on 2nd try"); 4469 ret = PTR_ERR(bh); 4470 bh = NULL; 4471 goto failed_mount; 4472 } 4473 es = (struct ext4_super_block *)(bh->b_data + offset); 4474 sbi->s_es = es; 4475 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { 4476 ext4_msg(sb, KERN_ERR, 4477 "Magic mismatch, very weird!"); 4478 goto failed_mount; 4479 } 4480 } 4481 4482 has_huge_files = ext4_has_feature_huge_file(sb); 4483 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 4484 has_huge_files); 4485 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 4486 4487 sbi->s_desc_size = le16_to_cpu(es->s_desc_size); 4488 if (ext4_has_feature_64bit(sb)) { 4489 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 4490 sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 4491 !is_power_of_2(sbi->s_desc_size)) { 4492 ext4_msg(sb, KERN_ERR, 4493 "unsupported descriptor size %lu", 4494 sbi->s_desc_size); 4495 goto failed_mount; 4496 } 4497 } else 4498 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 4499 4500 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 4501 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 4502 4503 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 4504 if (sbi->s_inodes_per_block == 0) 4505 goto cantfind_ext4; 4506 if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || 4507 sbi->s_inodes_per_group > blocksize * 8) { 4508 ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", 4509 sbi->s_inodes_per_group); 4510 goto failed_mount; 4511 } 4512 sbi->s_itb_per_group = sbi->s_inodes_per_group / 4513 sbi->s_inodes_per_block; 4514 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); 4515 sbi->s_sbh = bh; 4516 sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; 4517 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 4518 sbi->s_desc_per_block_bits 
= ilog2(EXT4_DESC_PER_BLOCK(sb)); 4519 4520 for (i = 0; i < 4; i++) 4521 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 4522 sbi->s_def_hash_version = es->s_def_hash_version; 4523 if (ext4_has_feature_dir_index(sb)) { 4524 i = le32_to_cpu(es->s_flags); 4525 if (i & EXT2_FLAGS_UNSIGNED_HASH) 4526 sbi->s_hash_unsigned = 3; 4527 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { 4528#ifdef __CHAR_UNSIGNED__ 4529 if (!sb_rdonly(sb)) 4530 es->s_flags |= 4531 cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); 4532 sbi->s_hash_unsigned = 3; 4533#else 4534 if (!sb_rdonly(sb)) 4535 es->s_flags |= 4536 cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 4537#endif 4538 } 4539 } 4540 4541 /* Handle clustersize */ 4542 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); 4543 if (ext4_has_feature_bigalloc(sb)) { 4544 if (clustersize < blocksize) { 4545 ext4_msg(sb, KERN_ERR, 4546 "cluster size (%d) smaller than " 4547 "block size (%d)", clustersize, blocksize); 4548 goto failed_mount; 4549 } 4550 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - 4551 le32_to_cpu(es->s_log_block_size); 4552 sbi->s_clusters_per_group = 4553 le32_to_cpu(es->s_clusters_per_group); 4554 if (sbi->s_clusters_per_group > blocksize * 8) { 4555 ext4_msg(sb, KERN_ERR, 4556 "#clusters per group too big: %lu", 4557 sbi->s_clusters_per_group); 4558 goto failed_mount; 4559 } 4560 if (sbi->s_blocks_per_group != 4561 (sbi->s_clusters_per_group * (clustersize / blocksize))) { 4562 ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " 4563 "clusters per group (%lu) inconsistent", 4564 sbi->s_blocks_per_group, 4565 sbi->s_clusters_per_group); 4566 goto failed_mount; 4567 } 4568 } else { 4569 if (clustersize != blocksize) { 4570 ext4_msg(sb, KERN_ERR, 4571 "fragment/cluster size (%d) != " 4572 "block size (%d)", clustersize, blocksize); 4573 goto failed_mount; 4574 } 4575 if (sbi->s_blocks_per_group > blocksize * 8) { 4576 ext4_msg(sb, KERN_ERR, 4577 "#blocks per group too big: %lu", 4578 sbi->s_blocks_per_group); 4579 goto failed_mount; 4580 } 4581 sbi->s_clusters_per_group = sbi->s_blocks_per_group; 4582 sbi->s_cluster_bits = 0; 4583 } 4584 sbi->s_cluster_ratio = clustersize / blocksize; 4585 4586 /* Do we have standard group size of clustersize * 8 blocks ? */ 4587 if (sbi->s_blocks_per_group == clustersize << 3) 4588 set_opt2(sb, STD_GROUP_SIZE); 4589 4590 /* 4591 * Test whether we have more sectors than will fit in sector_t, 4592 * and whether the max offset is addressable by the page cache. 4593 */ 4594 err = generic_check_addressable(sb->s_blocksize_bits, 4595 ext4_blocks_count(es)); 4596 if (err) { 4597 ext4_msg(sb, KERN_ERR, "filesystem" 4598 " too large to mount safely on this system"); 4599 goto failed_mount; 4600 } 4601 4602 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 4603 goto cantfind_ext4; 4604 4605 /* check blocks count against device size */ 4606 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 4607 if (blocks_count && ext4_blocks_count(es) > blocks_count) { 4608 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " 4609 "exceeds size of device (%llu blocks)", 4610 ext4_blocks_count(es), blocks_count); 4611 goto failed_mount; 4612 } 4613 4614 /* 4615 * It makes no sense for the first data block to be beyond the end 4616 * of the filesystem. 
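	 * (s_first_data_block is 1 on 1KiB-block filesystems and 0
	 * otherwise, so any large value here is a sign of corruption.)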
4617 */ 4618 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 4619 ext4_msg(sb, KERN_WARNING, "bad geometry: first data " 4620 "block %u is beyond end of filesystem (%llu)", 4621 le32_to_cpu(es->s_first_data_block), 4622 ext4_blocks_count(es)); 4623 goto failed_mount; 4624 } 4625 if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && 4626 (sbi->s_cluster_ratio == 1)) { 4627 ext4_msg(sb, KERN_WARNING, "bad geometry: first data " 4628 "block is 0 with a 1k block and cluster size"); 4629 goto failed_mount; 4630 } 4631 4632 blocks_count = (ext4_blocks_count(es) - 4633 le32_to_cpu(es->s_first_data_block) + 4634 EXT4_BLOCKS_PER_GROUP(sb) - 1); 4635 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 4636 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { 4637 ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " 4638 "(block count %llu, first data block %u, " 4639 "blocks per group %lu)", blocks_count, 4640 ext4_blocks_count(es), 4641 le32_to_cpu(es->s_first_data_block), 4642 EXT4_BLOCKS_PER_GROUP(sb)); 4643 goto failed_mount; 4644 } 4645 sbi->s_groups_count = blocks_count; 4646 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, 4647 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 4648 if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != 4649 le32_to_cpu(es->s_inodes_count)) { 4650 ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", 4651 le32_to_cpu(es->s_inodes_count), 4652 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); 4653 ret = -EINVAL; 4654 goto failed_mount; 4655 } 4656 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 4657 EXT4_DESC_PER_BLOCK(sb); 4658 if (ext4_has_feature_meta_bg(sb)) { 4659 if (le32_to_cpu(es->s_first_meta_bg) > db_count) { 4660 ext4_msg(sb, KERN_WARNING, 4661 "first meta block group too large: %u " 4662 "(group descriptor block count %u)", 4663 le32_to_cpu(es->s_first_meta_bg), db_count); 4664 goto failed_mount; 4665 } 4666 } 4667 rcu_assign_pointer(sbi->s_group_desc, 4668 kvmalloc_array(db_count, 4669 sizeof(struct buffer_head *), 4670 GFP_KERNEL)); 4671 if (sbi->s_group_desc == NULL) { 4672 ext4_msg(sb, KERN_ERR, "not enough memory"); 4673 ret = -ENOMEM; 4674 goto failed_mount; 4675 } 4676 4677 bgl_lock_init(sbi->s_blockgroup_lock); 4678 4679 /* Pre-read the descriptors into the buffer cache */ 4680 for (i = 0; i < db_count; i++) { 4681 block = descriptor_loc(sb, logical_sb_block, i); 4682 ext4_sb_breadahead_unmovable(sb, block); 4683 } 4684 4685 for (i = 0; i < db_count; i++) { 4686 struct buffer_head *bh; 4687 4688 block = descriptor_loc(sb, logical_sb_block, i); 4689 bh = ext4_sb_bread_unmovable(sb, block); 4690 if (IS_ERR(bh)) { 4691 ext4_msg(sb, KERN_ERR, 4692 "can't read group descriptor %d", i); 4693 db_count = i; 4694 ret = PTR_ERR(bh); 4695 bh = NULL; 4696 goto failed_mount2; 4697 } 4698 rcu_read_lock(); 4699 rcu_dereference(sbi->s_group_desc)[i] = bh; 4700 rcu_read_unlock(); 4701 } 4702 sbi->s_gdb_count = db_count; 4703 if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { 4704 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 4705 ret = -EFSCORRUPTED; 4706 goto failed_mount2; 4707 } 4708 4709 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 4710 spin_lock_init(&sbi->s_error_lock); 4711 INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); 4712 4713 /* Register extent status tree shrinker */ 4714 if (ext4_es_register_shrinker(sbi)) 4715 goto failed_mount3; 4716 4717 sbi->s_stripe = ext4_get_stripe_size(sbi); 
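	/*
	 * Default cap, in KiB, on zeroing out small unwritten extents in
	 * place rather than splitting them; tunable later through the
	 * extent_max_zeroout_kb sysfs attribute.
	 */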
4718 sbi->s_extent_max_zeroout_kb = 32; 4719 4720 /* 4721 * set up enough so that it can read an inode 4722 */ 4723 sb->s_op = &ext4_sops; 4724 sb->s_export_op = &ext4_export_ops; 4725 sb->s_xattr = ext4_xattr_handlers; 4726#ifdef CONFIG_FS_ENCRYPTION 4727 sb->s_cop = &ext4_cryptops; 4728#endif 4729#ifdef CONFIG_FS_VERITY 4730 sb->s_vop = &ext4_verityops; 4731#endif 4732#ifdef CONFIG_QUOTA 4733 sb->dq_op = &ext4_quota_operations; 4734 if (ext4_has_feature_quota(sb)) 4735 sb->s_qcop = &dquot_quotactl_sysfile_ops; 4736 else 4737 sb->s_qcop = &ext4_qctl_operations; 4738 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; 4739#endif 4740 memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); 4741 4742 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 4743 mutex_init(&sbi->s_orphan_lock); 4744 4745 /* Initialize fast commit stuff */ 4746 atomic_set(&sbi->s_fc_subtid, 0); 4747 atomic_set(&sbi->s_fc_ineligible_updates, 0); 4748 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); 4749 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); 4750 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); 4751 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); 4752 sbi->s_fc_bytes = 0; 4753 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 4754 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 4755 spin_lock_init(&sbi->s_fc_lock); 4756 memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); 4757 sbi->s_fc_replay_state.fc_regions = NULL; 4758 sbi->s_fc_replay_state.fc_regions_size = 0; 4759 sbi->s_fc_replay_state.fc_regions_used = 0; 4760 sbi->s_fc_replay_state.fc_regions_valid = 0; 4761 sbi->s_fc_replay_state.fc_modified_inodes = NULL; 4762 sbi->s_fc_replay_state.fc_modified_inodes_size = 0; 4763 sbi->s_fc_replay_state.fc_modified_inodes_used = 0; 4764 4765 sb->s_root = NULL; 4766 4767 needs_recovery = (es->s_last_orphan != 0 || 4768 ext4_has_feature_journal_needs_recovery(sb)); 4769 4770 if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { 4771 err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); 4772 if (err) 4773 goto failed_mount3a; 4774 } 4775 4776 /* 4777 * The first inode we look at is the journal inode. Don't try 4778 * root first: it may be modified in the journal! 
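	 * Log replay below may rewrite the root inode's block, so
	 * reading root before recovery could observe stale contents.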
4779 */ 4780 if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { 4781 err = ext4_load_journal(sb, es, journal_devnum); 4782 if (err) 4783 goto failed_mount3a; 4784 } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && 4785 ext4_has_feature_journal_needs_recovery(sb)) { 4786 ext4_msg(sb, KERN_ERR, "required journal recovery " 4787 "suppressed and not mounted read-only"); 4788 goto failed_mount3a; 4789 } else { 4790 /* Nojournal mode, all journal mount options are illegal */ 4791 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 4792 ext4_msg(sb, KERN_ERR, "can't mount with " 4793 "journal_async_commit, fs mounted w/o journal"); 4794 goto failed_mount3a; 4795 } 4796 4797 if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { 4798 ext4_msg(sb, KERN_ERR, "can't mount with " 4799 "journal_checksum, fs mounted w/o journal"); 4800 goto failed_mount3a; 4801 } 4802 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 4803 ext4_msg(sb, KERN_ERR, "can't mount with " 4804 "commit=%lu, fs mounted w/o journal", 4805 sbi->s_commit_interval / HZ); 4806 goto failed_mount3a; 4807 } 4808 if (EXT4_MOUNT_DATA_FLAGS & 4809 (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { 4810 ext4_msg(sb, KERN_ERR, "can't mount with " 4811 "data=, fs mounted w/o journal"); 4812 goto failed_mount3a; 4813 } 4814 sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; 4815 clear_opt(sb, JOURNAL_CHECKSUM); 4816 clear_opt(sb, DATA_FLAGS); 4817 clear_opt2(sb, JOURNAL_FAST_COMMIT); 4818 sbi->s_journal = NULL; 4819 needs_recovery = 0; 4820 goto no_journal; 4821 } 4822 4823 if (ext4_has_feature_64bit(sb) && 4824 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 4825 JBD2_FEATURE_INCOMPAT_64BIT)) { 4826 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 4827 goto failed_mount_wq; 4828 } 4829 4830 if (!set_journal_csum_feature_set(sb)) { 4831 ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " 4832 "feature set"); 4833 goto failed_mount_wq; 4834 } 4835 4836 if (test_opt2(sb, JOURNAL_FAST_COMMIT) && 4837 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 4838 JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { 4839 ext4_msg(sb, KERN_ERR, 4840 "Failed to set fast commit journal feature"); 4841 goto failed_mount_wq; 4842 } 4843 4844 /* We have now updated the journal if required, so we can 4845 * validate the data journaling mode. 
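	 * Without the jbd2 revoke feature, replay could restore stale
	 * metadata over blocks that have since been reused for file data,
	 * which is why ordered and writeback modes insist on it below.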
*/ 4846 switch (test_opt(sb, DATA_FLAGS)) { 4847 case 0: 4848 /* No mode set, assume a default based on the journal 4849 * capabilities: ORDERED_DATA if the journal can 4850 * cope, else JOURNAL_DATA 4851 */ 4852 if (jbd2_journal_check_available_features 4853 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 4854 set_opt(sb, ORDERED_DATA); 4855 sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 4856 } else { 4857 set_opt(sb, JOURNAL_DATA); 4858 sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 4859 } 4860 break; 4861 4862 case EXT4_MOUNT_ORDERED_DATA: 4863 case EXT4_MOUNT_WRITEBACK_DATA: 4864 if (!jbd2_journal_check_available_features 4865 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 4866 ext4_msg(sb, KERN_ERR, "Journal does not support " 4867 "requested data journaling mode"); 4868 goto failed_mount_wq; 4869 } 4870 default: 4871 break; 4872 } 4873 4874 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && 4875 test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 4876 ext4_msg(sb, KERN_ERR, "can't mount with " 4877 "journal_async_commit in data=ordered mode"); 4878 goto failed_mount_wq; 4879 } 4880 4881 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4882 4883 sbi->s_journal->j_submit_inode_data_buffers = 4884 ext4_journal_submit_inode_data_buffers; 4885 sbi->s_journal->j_finish_inode_data_buffers = 4886 ext4_journal_finish_inode_data_buffers; 4887 4888no_journal: 4889 if (!test_opt(sb, NO_MBCACHE)) { 4890 sbi->s_ea_block_cache = ext4_xattr_create_cache(); 4891 if (!sbi->s_ea_block_cache) { 4892 ext4_msg(sb, KERN_ERR, 4893 "Failed to create ea_block_cache"); 4894 goto failed_mount_wq; 4895 } 4896 4897 if (ext4_has_feature_ea_inode(sb)) { 4898 sbi->s_ea_inode_cache = ext4_xattr_create_cache(); 4899 if (!sbi->s_ea_inode_cache) { 4900 ext4_msg(sb, KERN_ERR, 4901 "Failed to create ea_inode_cache"); 4902 goto failed_mount_wq; 4903 } 4904 } 4905 } 4906 4907 if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { 4908 ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); 4909 goto failed_mount_wq; 4910 } 4911 4912 /* 4913 * Get the # of file system overhead blocks from the 4914 * superblock if present. 4915 */ 4916 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 4917 /* ignore the precalculated value if it is ridiculous */ 4918 if (sbi->s_overhead > ext4_blocks_count(es)) 4919 sbi->s_overhead = 0; 4920 /* 4921 * If the bigalloc feature is not enabled recalculating the 4922 * overhead doesn't take long, so we might as well just redo 4923 * it to make sure we are using the correct value. 4924 */ 4925 if (!ext4_has_feature_bigalloc(sb)) 4926 sbi->s_overhead = 0; 4927 if (sbi->s_overhead == 0) { 4928 err = ext4_calculate_overhead(sb); 4929 if (err) 4930 goto failed_mount_wq; 4931 } 4932 4933 /* 4934 * The maximum number of concurrent works can be high and 4935 * concurrency isn't really necessary. Limit it to 1. 4936 */ 4937 EXT4_SB(sb)->rsv_conversion_wq = 4938 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 4939 if (!EXT4_SB(sb)->rsv_conversion_wq) { 4940 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); 4941 ret = -ENOMEM; 4942 goto failed_mount4; 4943 } 4944 4945 /* 4946 * The jbd2_journal_load will have done any necessary log recovery, 4947 * so we can safely mount the rest of the filesystem now. 
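	 * The root inode, fetched immediately below, is the first thing
	 * we read from the now-consistent filesystem.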
4948 */ 4949 4950 root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); 4951 if (IS_ERR(root)) { 4952 ext4_msg(sb, KERN_ERR, "get root inode failed"); 4953 ret = PTR_ERR(root); 4954 root = NULL; 4955 goto failed_mount4; 4956 } 4957 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 4958 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 4959 iput(root); 4960 goto failed_mount4; 4961 } 4962 4963#ifdef CONFIG_UNICODE 4964 if (sb->s_encoding) 4965 sb->s_d_op = &ext4_dentry_ops; 4966#endif 4967 4968 sb->s_root = d_make_root(root); 4969 if (!sb->s_root) { 4970 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 4971 ret = -ENOMEM; 4972 goto failed_mount4; 4973 } 4974 4975 ret = ext4_setup_super(sb, es, sb_rdonly(sb)); 4976 if (ret == -EROFS) { 4977 sb->s_flags |= SB_RDONLY; 4978 ret = 0; 4979 } else if (ret) 4980 goto failed_mount4a; 4981 4982 ext4_set_resv_clusters(sb); 4983 4984 if (test_opt(sb, BLOCK_VALIDITY)) { 4985 err = ext4_setup_system_zone(sb); 4986 if (err) { 4987 ext4_msg(sb, KERN_ERR, "failed to initialize system " 4988 "zone (%d)", err); 4989 goto failed_mount4a; 4990 } 4991 } 4992 ext4_fc_replay_cleanup(sb); 4993 4994 ext4_ext_init(sb); 4995 err = ext4_mb_init(sb); 4996 if (err) { 4997 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 4998 err); 4999 goto failed_mount5; 5000 } 5001 5002 /* 5003 * We can only set up the journal commit callback once 5004 * mballoc is initialized 5005 */ 5006 if (sbi->s_journal) 5007 sbi->s_journal->j_commit_callback = 5008 ext4_journal_commit_callback; 5009 5010 block = ext4_count_free_clusters(sb); 5011 ext4_free_blocks_count_set(sbi->s_es, 5012 EXT4_C2B(sbi, block)); 5013 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5014 GFP_KERNEL); 5015 if (!err) { 5016 unsigned long freei = ext4_count_free_inodes(sb); 5017 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 5018 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 5019 GFP_KERNEL); 5020 } 5021 /* 5022 * Update the checksum after updating free space/inode 5023 * counters. Otherwise the superblock can have an incorrect 5024 * checksum in the buffer cache until it is written out and 5025 * e2fsprogs programs trying to open a file system immediately 5026 * after it is mounted can fail. 5027 */ 5028 ext4_superblock_csum_set(sb); 5029 if (!err) 5030 err = percpu_counter_init(&sbi->s_dirs_counter, 5031 ext4_count_dirs(sb), GFP_KERNEL); 5032 if (!err) 5033 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 5034 GFP_KERNEL); 5035 if (!err) 5036 err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, 5037 GFP_KERNEL); 5038 if (!err) 5039 err = percpu_init_rwsem(&sbi->s_writepages_rwsem); 5040 5041 if (err) { 5042 ext4_msg(sb, KERN_ERR, "insufficient memory"); 5043 goto failed_mount6; 5044 } 5045 5046 if (ext4_has_feature_flex_bg(sb)) 5047 if (!ext4_fill_flex_info(sb)) { 5048 ext4_msg(sb, KERN_ERR, 5049 "unable to initialize " 5050 "flex_bg meta info!"); 5051 ret = -ENOMEM; 5052 goto failed_mount6; 5053 } 5054 5055 err = ext4_register_li_request(sb, first_not_zeroed); 5056 if (err) 5057 goto failed_mount6; 5058 5059 err = ext4_register_sysfs(sb); 5060 if (err) 5061 goto failed_mount7; 5062 5063#ifdef CONFIG_QUOTA 5064 /* Enable quota usage during mount. 
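	 * This covers the journalled quota feature; old-style quota
	 * files are still enabled later from userspace.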
*/ 5065 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { 5066 err = ext4_enable_quotas(sb); 5067 if (err) 5068 goto failed_mount8; 5069 } 5070#endif /* CONFIG_QUOTA */ 5071 5072 /* 5073 * Save the original bdev mapping's wb_err value which could be 5074 * used to detect the metadata async write error. 5075 */ 5076 spin_lock_init(&sbi->s_bdev_wb_lock); 5077 errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, 5078 &sbi->s_bdev_wb_err); 5079 sb->s_bdev->bd_super = sb; 5080 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 5081 ext4_orphan_cleanup(sb, es); 5082 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 5083 if (needs_recovery) { 5084 ext4_msg(sb, KERN_INFO, "recovery complete"); 5085 err = ext4_mark_recovery_complete(sb, es); 5086 if (err) 5087 goto failed_mount8; 5088 } 5089 if (EXT4_SB(sb)->s_journal) { 5090 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 5091 descr = " journalled data mode"; 5092 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 5093 descr = " ordered data mode"; 5094 else 5095 descr = " writeback data mode"; 5096 } else 5097 descr = "out journal"; 5098 5099 if (test_opt(sb, DISCARD)) { 5100 struct request_queue *q = bdev_get_queue(sb->s_bdev); 5101 if (!blk_queue_discard(q)) 5102 ext4_msg(sb, KERN_WARNING, 5103 "mounting with \"discard\" option, but " 5104 "the device does not support discard"); 5105 } 5106 5107 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) 5108 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 5109 "Opts: %.*s%s%s", descr, 5110 (int) sizeof(sbi->s_es->s_mount_opts), 5111 sbi->s_es->s_mount_opts, 5112 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 5113 5114 if (es->s_error_count) 5115 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 5116 5117 /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
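	 * Three separate states are kept, for errors, warnings and plain
	 * messages, each with a 5*HZ interval and a burst of 10.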
*/ 5118 ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); 5119 ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); 5120 ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); 5121 atomic_set(&sbi->s_warning_count, 0); 5122 atomic_set(&sbi->s_msg_count, 0); 5123 5124 kfree(orig_data); 5125 return 0; 5126 5127cantfind_ext4: 5128 if (!silent) 5129 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 5130 goto failed_mount; 5131 5132failed_mount8: 5133 ext4_unregister_sysfs(sb); 5134 kobject_put(&sbi->s_kobj); 5135failed_mount7: 5136 ext4_unregister_li_request(sb); 5137failed_mount6: 5138 ext4_mb_release(sb); 5139 rcu_read_lock(); 5140 flex_groups = rcu_dereference(sbi->s_flex_groups); 5141 if (flex_groups) { 5142 for (i = 0; i < sbi->s_flex_groups_allocated; i++) 5143 kvfree(flex_groups[i]); 5144 kvfree(flex_groups); 5145 } 5146 rcu_read_unlock(); 5147 percpu_counter_destroy(&sbi->s_freeclusters_counter); 5148 percpu_counter_destroy(&sbi->s_freeinodes_counter); 5149 percpu_counter_destroy(&sbi->s_dirs_counter); 5150 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 5151 percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 5152 percpu_free_rwsem(&sbi->s_writepages_rwsem); 5153failed_mount5: 5154 ext4_ext_release(sb); 5155 ext4_release_system_zone(sb); 5156failed_mount4a: 5157 dput(sb->s_root); 5158 sb->s_root = NULL; 5159failed_mount4: 5160 ext4_msg(sb, KERN_ERR, "mount failed"); 5161 if (EXT4_SB(sb)->rsv_conversion_wq) 5162 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 5163failed_mount_wq: 5164 ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); 5165 sbi->s_ea_inode_cache = NULL; 5166 5167 ext4_xattr_destroy_cache(sbi->s_ea_block_cache); 5168 sbi->s_ea_block_cache = NULL; 5169 5170 if (sbi->s_journal) { 5171 /* flush s_error_work before journal destroy. */ 5172 flush_work(&sbi->s_error_work); 5173 jbd2_journal_destroy(sbi->s_journal); 5174 sbi->s_journal = NULL; 5175 } 5176failed_mount3a: 5177 ext4_es_unregister_shrinker(sbi); 5178failed_mount3: 5179 /* flush s_error_work before sbi destroy */ 5180 flush_work(&sbi->s_error_work); 5181 del_timer_sync(&sbi->s_err_report); 5182 ext4_stop_mmpd(sbi); 5183failed_mount2: 5184 rcu_read_lock(); 5185 group_desc = rcu_dereference(sbi->s_group_desc); 5186 for (i = 0; i < db_count; i++) 5187 brelse(group_desc[i]); 5188 kvfree(group_desc); 5189 rcu_read_unlock(); 5190failed_mount: 5191 if (sbi->s_chksum_driver) 5192 crypto_free_shash(sbi->s_chksum_driver); 5193 5194#ifdef CONFIG_UNICODE 5195 utf8_unload(sb->s_encoding); 5196#endif 5197 5198#ifdef CONFIG_QUOTA 5199 for (i = 0; i < EXT4_MAXQUOTAS; i++) 5200 kfree(get_qf_name(sb, sbi, i)); 5201#endif 5202 fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); 5203 /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ 5204 brelse(bh); 5205 ext4_blkdev_remove(sbi); 5206out_fail: 5207 invalidate_bdev(sb->s_bdev); 5208 sb->s_fs_info = NULL; 5209 kfree(sbi->s_blockgroup_lock); 5210out_free_base: 5211 kfree(sbi); 5212 kfree(orig_data); 5213 fs_put_dax(dax_dev); 5214 return err ? err : ret; 5215} 5216 5217/* 5218 * Setup any per-fs journal parameters now. We'll do this both on 5219 * initial mount, once the journal has been initialised but before we've 5220 * done any recovery; and again on any subsequent remount. 
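 * Remount matters because the commit interval, the batch times and the
 * barrier and data-error flags are all mount options that may have
 * changed.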
5221 */ 5222static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) 5223{ 5224 struct ext4_sb_info *sbi = EXT4_SB(sb); 5225 5226 journal->j_commit_interval = sbi->s_commit_interval; 5227 journal->j_min_batch_time = sbi->s_min_batch_time; 5228 journal->j_max_batch_time = sbi->s_max_batch_time; 5229 ext4_fc_init(sb, journal); 5230 5231 write_lock(&journal->j_state_lock); 5232 if (test_opt(sb, BARRIER)) 5233 journal->j_flags |= JBD2_BARRIER; 5234 else 5235 journal->j_flags &= ~JBD2_BARRIER; 5236 if (test_opt(sb, DATA_ERR_ABORT)) 5237 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 5238 else 5239 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 5240 write_unlock(&journal->j_state_lock); 5241} 5242 5243static struct inode *ext4_get_journal_inode(struct super_block *sb, 5244 unsigned int journal_inum) 5245{ 5246 struct inode *journal_inode; 5247 5248 /* 5249 * Test for the existence of a valid inode on disk. Bad things 5250 * happen if we iget() an unused inode, as the subsequent iput() 5251 * will try to delete it. 5252 */ 5253 journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); 5254 if (IS_ERR(journal_inode)) { 5255 ext4_msg(sb, KERN_ERR, "no journal found"); 5256 return NULL; 5257 } 5258 if (!journal_inode->i_nlink) { 5259 make_bad_inode(journal_inode); 5260 iput(journal_inode); 5261 ext4_msg(sb, KERN_ERR, "journal inode is deleted"); 5262 return NULL; 5263 } 5264 5265 jbd_debug(2, "Journal inode found at %p: %lld bytes\n", 5266 journal_inode, journal_inode->i_size); 5267 if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { 5268 ext4_msg(sb, KERN_ERR, "invalid journal inode"); 5269 iput(journal_inode); 5270 return NULL; 5271 } 5272 return journal_inode; 5273} 5274 5275static journal_t *ext4_get_journal(struct super_block *sb, 5276 unsigned int journal_inum) 5277{ 5278 struct inode *journal_inode; 5279 journal_t *journal; 5280 5281 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5282 return NULL; 5283 5284 journal_inode = ext4_get_journal_inode(sb, journal_inum); 5285 if (!journal_inode) 5286 return NULL; 5287 5288 journal = jbd2_journal_init_inode(journal_inode); 5289 if (!journal) { 5290 ext4_msg(sb, KERN_ERR, "Could not load journal inode"); 5291 iput(journal_inode); 5292 return NULL; 5293 } 5294 journal->j_private = sb; 5295 ext4_init_journal_params(sb, journal); 5296 return journal; 5297} 5298 5299static journal_t *ext4_get_dev_journal(struct super_block *sb, 5300 dev_t j_dev) 5301{ 5302 struct buffer_head *bh; 5303 journal_t *journal; 5304 ext4_fsblk_t start; 5305 ext4_fsblk_t len; 5306 int hblock, blocksize; 5307 ext4_fsblk_t sb_block; 5308 unsigned long offset; 5309 struct ext4_super_block *es; 5310 struct block_device *bdev; 5311 5312 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5313 return NULL; 5314 5315 bdev = ext4_blkdev_get(j_dev, sb); 5316 if (bdev == NULL) 5317 return NULL; 5318 5319 blocksize = sb->s_blocksize; 5320 hblock = bdev_logical_block_size(bdev); 5321 if (blocksize < hblock) { 5322 ext4_msg(sb, KERN_ERR, 5323 "blocksize too small for journal device"); 5324 goto out_bdev; 5325 } 5326 5327 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; 5328 offset = EXT4_MIN_BLOCK_SIZE % blocksize; 5329 set_blocksize(bdev, blocksize); 5330 if (!(bh = __bread(bdev, sb_block, blocksize))) { 5331 ext4_msg(sb, KERN_ERR, "couldn't read superblock of " 5332 "external journal"); 5333 goto out_bdev; 5334 } 5335 5336 es = (struct ext4_super_block *) (bh->b_data + offset); 5337 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 5338 
!(le32_to_cpu(es->s_feature_incompat) & 5339 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 5340 ext4_msg(sb, KERN_ERR, "external journal has " 5341 "bad superblock"); 5342 brelse(bh); 5343 goto out_bdev; 5344 } 5345 5346 if ((le32_to_cpu(es->s_feature_ro_compat) & 5347 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 5348 es->s_checksum != ext4_superblock_csum(sb, es)) { 5349 ext4_msg(sb, KERN_ERR, "external journal has " 5350 "corrupt superblock"); 5351 brelse(bh); 5352 goto out_bdev; 5353 } 5354 5355 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 5356 ext4_msg(sb, KERN_ERR, "journal UUID does not match"); 5357 brelse(bh); 5358 goto out_bdev; 5359 } 5360 5361 len = ext4_blocks_count(es); 5362 start = sb_block + 1; 5363 brelse(bh); /* we're done with the superblock */ 5364 5365 journal = jbd2_journal_init_dev(bdev, sb->s_bdev, 5366 start, len, blocksize); 5367 if (!journal) { 5368 ext4_msg(sb, KERN_ERR, "failed to create device journal"); 5369 goto out_bdev; 5370 } 5371 journal->j_private = sb; 5372 if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { 5373 ext4_msg(sb, KERN_ERR, "I/O error on journal device"); 5374 goto out_journal; 5375 } 5376 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 5377 ext4_msg(sb, KERN_ERR, "External journal has more than one " 5378 "user (unsupported) - %d", 5379 be32_to_cpu(journal->j_superblock->s_nr_users)); 5380 goto out_journal; 5381 } 5382 EXT4_SB(sb)->s_journal_bdev = bdev; 5383 ext4_init_journal_params(sb, journal); 5384 return journal; 5385 5386out_journal: 5387 jbd2_journal_destroy(journal); 5388out_bdev: 5389 ext4_blkdev_put(bdev); 5390 return NULL; 5391} 5392 5393static int ext4_load_journal(struct super_block *sb, 5394 struct ext4_super_block *es, 5395 unsigned long journal_devnum) 5396{ 5397 journal_t *journal; 5398 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); 5399 dev_t journal_dev; 5400 int err = 0; 5401 int really_read_only; 5402 int journal_dev_ro; 5403 5404 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5405 return -EFSCORRUPTED; 5406 5407 if (journal_devnum && 5408 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 5409 ext4_msg(sb, KERN_INFO, "external journal device major/minor " 5410 "numbers have changed"); 5411 journal_dev = new_decode_dev(journal_devnum); 5412 } else 5413 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 5414 5415 if (journal_inum && journal_dev) { 5416 ext4_msg(sb, KERN_ERR, 5417 "filesystem has both journal inode and journal device!"); 5418 return -EINVAL; 5419 } 5420 5421 if (journal_inum) { 5422 journal = ext4_get_journal(sb, journal_inum); 5423 if (!journal) 5424 return -EINVAL; 5425 } else { 5426 journal = ext4_get_dev_journal(sb, journal_dev); 5427 if (!journal) 5428 return -EINVAL; 5429 } 5430 5431 journal_dev_ro = bdev_read_only(journal->j_dev); 5432 really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; 5433 5434 if (journal_dev_ro && !sb_rdonly(sb)) { 5435 ext4_msg(sb, KERN_ERR, 5436 "journal device read-only, try mounting with '-o ro'"); 5437 err = -EROFS; 5438 goto err_out; 5439 } 5440 5441 /* 5442 * Are we loading a blank journal or performing recovery after a 5443 * crash? For recovery, we need to check in advance whether we 5444 * can get read-write access to the device. 
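	 * A clean journal can be handled even on a read-only device, but
	 * replay has to write to both the journal and the main filesystem.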
5445 */ 5446 if (ext4_has_feature_journal_needs_recovery(sb)) { 5447 if (sb_rdonly(sb)) { 5448 ext4_msg(sb, KERN_INFO, "INFO: recovery " 5449 "required on readonly filesystem"); 5450 if (really_read_only) { 5451 ext4_msg(sb, KERN_ERR, "write access " 5452 "unavailable, cannot proceed " 5453 "(try mounting with noload)"); 5454 err = -EROFS; 5455 goto err_out; 5456 } 5457 ext4_msg(sb, KERN_INFO, "write access will " 5458 "be enabled during recovery"); 5459 } 5460 } 5461 5462 if (!(journal->j_flags & JBD2_BARRIER)) 5463 ext4_msg(sb, KERN_INFO, "barriers disabled"); 5464 5465 if (!ext4_has_feature_journal_needs_recovery(sb)) 5466 err = jbd2_journal_wipe(journal, !really_read_only); 5467 if (!err) { 5468 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); 5469 if (save) 5470 memcpy(save, ((char *) es) + 5471 EXT4_S_ERR_START, EXT4_S_ERR_LEN); 5472 err = jbd2_journal_load(journal); 5473 if (save) 5474 memcpy(((char *) es) + EXT4_S_ERR_START, 5475 save, EXT4_S_ERR_LEN); 5476 kfree(save); 5477 } 5478 5479 if (err) { 5480 ext4_msg(sb, KERN_ERR, "error loading journal"); 5481 goto err_out; 5482 } 5483 5484 EXT4_SB(sb)->s_journal = journal; 5485 err = ext4_clear_journal_err(sb, es); 5486 if (err) { 5487 EXT4_SB(sb)->s_journal = NULL; 5488 jbd2_journal_destroy(journal); 5489 return err; 5490 } 5491 5492 if (!really_read_only && journal_devnum && 5493 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 5494 es->s_journal_dev = cpu_to_le32(journal_devnum); 5495 5496 /* Make sure we flush the recovery flag to disk. */ 5497 ext4_commit_super(sb); 5498 } 5499 5500 return 0; 5501 5502err_out: 5503 jbd2_journal_destroy(journal); 5504 return err; 5505} 5506 5507/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ 5508static void ext4_update_super(struct super_block *sb) 5509{ 5510 struct ext4_sb_info *sbi = EXT4_SB(sb); 5511 struct ext4_super_block *es = sbi->s_es; 5512 struct buffer_head *sbh = sbi->s_sbh; 5513 5514 lock_buffer(sbh); 5515 /* 5516 * If the file system is mounted read-only, don't update the 5517 * superblock write time. This avoids updating the superblock 5518 * write time when we are mounting the root file system 5519 * read/only but we need to replay the journal; at that point, 5520 * for people who are east of GMT and who make their clock 5521 * tick in localtime for Windows bug-for-bug compatibility, 5522 * the clock is set in the future, and this will cause e2fsck 5523 * to complain and force a full file system check. 
static int ext4_commit_super(struct super_block *sb)
{
	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
	int error = 0;

	if (!sbh)
		return -EINVAL;
	if (block_device_ejected(sb))
		return -ENODEV;

	ext4_update_super(sb);

	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
		/*
		 * Oh, dear.  A previous attempt to write the
		 * superblock failed.  This could happen because the
		 * USB device was yanked out.  Or it could happen to
		 * be a transient write error and maybe the block will
		 * be remapped.  Nothing we can do but to retry the
		 * write and hope for the best.
		 */
		ext4_msg(sb, KERN_ERR, "previous I/O error to "
			 "superblock detected");
		clear_buffer_write_io_error(sbh);
		set_buffer_uptodate(sbh);
	}
	BUFFER_TRACE(sbh, "marking dirty");
	mark_buffer_dirty(sbh);
	error = __sync_dirty_buffer(sbh,
		REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
	if (buffer_write_io_error(sbh)) {
		ext4_msg(sb, KERN_ERR, "I/O error while writing "
			 "superblock");
		clear_buffer_write_io_error(sbh);
		set_buffer_uptodate(sbh);
	}
	return error;
}
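/*
 * Note on the write in ext4_commit_super(): with barriers enabled the
 * superblock is submitted REQ_SYNC | REQ_FUA, so __sync_dirty_buffer()
 * returns only once the block has reached stable media, not merely the
 * device cache.  The same pattern for an arbitrary mapped buffer_head,
 * as a sketch:
 *
 *	mark_buffer_dirty(bh);
 *	err = __sync_dirty_buffer(bh, REQ_SYNC | REQ_FUA);
 */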
/*
 * Have we just finished recovery?  If so, and if we are mounting (or
 * remounting) the filesystem readonly, then we will end up with a
 * consistent fs on disk.  Record that fact.
 */
static int ext4_mark_recovery_complete(struct super_block *sb,
				       struct ext4_super_block *es)
{
	int err;
	journal_t *journal = EXT4_SB(sb)->s_journal;

	if (!ext4_has_feature_journal(sb)) {
		if (journal != NULL) {
			ext4_error(sb, "Journal got removed while the fs was "
				   "mounted!");
			return -EFSCORRUPTED;
		}
		return 0;
	}
	jbd2_journal_lock_updates(journal);
	err = jbd2_journal_flush(journal);
	if (err < 0)
		goto out;

	if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
		ext4_clear_feature_journal_needs_recovery(sb);
		ext4_commit_super(sb);
	}
out:
	jbd2_journal_unlock_updates(journal);
	return err;
}

/*
 * If we are mounting (or read-write remounting) a filesystem whose journal
 * has recorded an error from a previous lifetime, move that error to the
 * main filesystem now.
 */
static int ext4_clear_journal_err(struct super_block *sb,
				  struct ext4_super_block *es)
{
	journal_t *journal;
	int j_errno;
	const char *errstr;

	if (!ext4_has_feature_journal(sb)) {
		ext4_error(sb, "Journal got removed while the fs was mounted!");
		return -EFSCORRUPTED;
	}

	journal = EXT4_SB(sb)->s_journal;

	/*
	 * Now check for any error status which may have been recorded in the
	 * journal by a prior ext4_error() or ext4_abort().
	 */
	j_errno = jbd2_journal_errno(journal);
	if (j_errno) {
		char nbuf[16];

		errstr = ext4_decode_error(sb, j_errno, nbuf);
		ext4_warning(sb, "Filesystem error recorded "
			     "from previous mount: %s", errstr);
		ext4_warning(sb, "Marking fs in need of filesystem check.");

		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
		ext4_commit_super(sb);

		jbd2_journal_clear_err(journal);
		jbd2_journal_update_sb_errno(journal);
	}
	return 0;
}

/*
 * Force the running and committing transactions to commit,
 * and wait on the commit.
 */
int ext4_force_commit(struct super_block *sb)
{
	journal_t *journal;

	if (sb_rdonly(sb))
		return 0;

	journal = EXT4_SB(sb)->s_journal;
	return ext4_journal_force_commit(journal);
}

static int ext4_sync_fs(struct super_block *sb, int wait)
{
	int ret = 0;
	tid_t target;
	bool needs_barrier = false;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (unlikely(ext4_forced_shutdown(sbi)))
		return 0;

	trace_ext4_sync_fs(sb, wait);
	flush_workqueue(sbi->rsv_conversion_wq);
	/*
	 * Writeback quota in non-journalled quota case - journalled quota
	 * has no dirty dquots.
	 */
	dquot_writeback_dquots(sb, -1);
	/*
	 * Data writeback is possible w/o journal transaction, so the barrier
	 * must be sent at the end of the function.  But we can skip it if
	 * transaction_commit will do it for us.
	 */
	if (sbi->s_journal) {
		target = jbd2_get_latest_transaction(sbi->s_journal);
		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
			needs_barrier = true;

		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
			if (wait)
				ret = jbd2_log_wait_commit(sbi->s_journal,
							   target);
		}
	} else if (wait && test_opt(sb, BARRIER))
		needs_barrier = true;
	if (needs_barrier) {
		int err;
		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
		if (!ret)
			ret = err;
	}

	return ret;
}
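/*
 * Informal decision table for the flush in ext4_sync_fs() above: a cache
 * flush is issued here only when the caller asked to wait, barriers are
 * enabled, and no journal commit is going to send a barrier on our
 * behalf.  Collapsed into one condition, roughly:
 *
 *	if (wait && barriers_enabled && !commit_will_flush)
 *		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
 */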
/*
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
 *
 * Note that this function alone cannot bring the filesystem into a clean
 * state.  It relies on the upper layer to stop all data & metadata
 * modifications.
 */
static int ext4_freeze(struct super_block *sb)
{
	int error = 0;
	journal_t *journal;

	if (sb_rdonly(sb))
		return 0;

	journal = EXT4_SB(sb)->s_journal;

	if (journal) {
		/* Now we set up the journal barrier. */
		jbd2_journal_lock_updates(journal);

		/*
		 * Don't clear the needs_recovery flag if we failed to
		 * flush the journal.
		 */
		error = jbd2_journal_flush(journal);
		if (error < 0)
			goto out;

		/* Journal blocked and flushed, clear needs_recovery flag. */
		ext4_clear_feature_journal_needs_recovery(sb);
	}

	error = ext4_commit_super(sb);
out:
	if (journal)
		/* we rely on upper layer to stop further updates */
		jbd2_journal_unlock_updates(journal);
	return error;
}

/*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 */
static int ext4_unfreeze(struct super_block *sb)
{
	if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
		return 0;

	if (EXT4_SB(sb)->s_journal) {
		/* Reset the needs_recovery flag before the fs is unlocked. */
		ext4_set_feature_journal_needs_recovery(sb);
	}

	ext4_commit_super(sb);
	return 0;
}
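/*
 * The two hooks above are driven by the VFS freeze path.  From user
 * space the same sequence can be exercised with the FIFREEZE/FITHAW
 * ioctls; a sketch (userspace C, error handling omitted):
 *
 *	#include <fcntl.h>
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *
 *	int fd = open("/mnt", O_RDONLY);
 *	ioctl(fd, FIFREEZE, 0);	// -> ext4_freeze(): journal flushed, fs clean
 *	// take the block-level snapshot here
 *	ioctl(fd, FITHAW, 0);	// -> ext4_unfreeze(): RECOVER flag restored
 */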
/*
 * Structure to save mount options for ext4_remount's benefit
 */
struct ext4_mount_options {
	unsigned long s_mount_opt;
	unsigned long s_mount_opt2;
	kuid_t s_resuid;
	kgid_t s_resgid;
	unsigned long s_commit_interval;
	u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
	int s_jquota_fmt;
	char *s_qf_names[EXT4_MAXQUOTAS];
#endif
};

static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned long old_sb_flags, vfs_flags;
	struct ext4_mount_options old_opts;
	ext4_group_t g;
	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
	int err = 0;
#ifdef CONFIG_QUOTA
	int enable_quota = 0;
	int i, j;
	char *to_free[EXT4_MAXQUOTAS];
#endif
	char *orig_data = kstrdup(data, GFP_KERNEL);

	if (data && !orig_data)
		return -ENOMEM;

	/* Store the original options */
	old_sb_flags = sb->s_flags;
	old_opts.s_mount_opt = sbi->s_mount_opt;
	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
	old_opts.s_resuid = sbi->s_resuid;
	old_opts.s_resgid = sbi->s_resgid;
	old_opts.s_commit_interval = sbi->s_commit_interval;
	old_opts.s_min_batch_time = sbi->s_min_batch_time;
	old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		if (sbi->s_qf_names[i]) {
			char *qf_name = get_qf_name(sb, sbi, i);

			old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
			if (!old_opts.s_qf_names[i]) {
				for (j = 0; j < i; j++)
					kfree(old_opts.s_qf_names[j]);
				kfree(orig_data);
				return -ENOMEM;
			}
		} else
			old_opts.s_qf_names[i] = NULL;
#endif
	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;

	/*
	 * Some options can be enabled by ext4 and/or by the VFS mount flags;
	 * either way we need to make sure they match in both *flags and
	 * s_flags.  Copy the selected flags from *flags to s_flags.
	 */
	vfs_flags = SB_LAZYTIME | SB_I_VERSION;
	sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);

	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
		err = -EINVAL;
		goto restore_opts;
	}

	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
	    test_opt(sb, JOURNAL_CHECKSUM)) {
		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
			 "during remount not supported; ignoring");
		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
	}

	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and delalloc");
			err = -EINVAL;
			goto restore_opts;
		}
		if (test_opt(sb, DIOREAD_NOLOCK)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and dioread_nolock");
			err = -EINVAL;
			goto restore_opts;
		}
	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "journal_async_commit in data=ordered mode");
			err = -EINVAL;
			goto restore_opts;
		}
	}

	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
		err = -EINVAL;
		goto restore_opts;
	}

	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
		ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");

	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

	es = sbi->s_es;

	if (sbi->s_journal) {
		ext4_init_journal_params(sb, sbi->s_journal);
		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
	}

	/* Flush outstanding errors before changing fs state */
	flush_work(&sbi->s_error_work);

	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
			err = -EROFS;
			goto restore_opts;
		}

		if (*flags & SB_RDONLY) {
			err = sync_filesystem(sb);
			if (err < 0)
				goto restore_opts;
			err = dquot_suspend(sb, -1);
			if (err < 0)
				goto restore_opts;

			/*
			 * First of all, the unconditional stuff we have to do
			 * to disable replay of the journal when we next
			 * remount.
			 */
			sb->s_flags |= SB_RDONLY;

			/*
			 * OK, test if we are remounting a valid rw partition
			 * readonly, and if so set the rdonly flag and then
			 * mark the partition as valid again.
			 */
			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
			    (sbi->s_mount_state & EXT4_VALID_FS))
				es->s_state = cpu_to_le16(sbi->s_mount_state);

			if (sbi->s_journal) {
				/*
				 * We let remount-ro finish even if marking fs
				 * as clean failed...
				 */
				ext4_mark_recovery_complete(sb, es);
			}
		} else {
			/* Make sure we can mount this feature set readwrite */
			if (ext4_has_feature_readonly(sb) ||
			    !ext4_feature_set_ok(sb, 0)) {
				err = -EROFS;
				goto restore_opts;
			}
			/*
			 * Make sure the group descriptor checksums
			 * are sane.  If they aren't, refuse to remount r/w.
			 */
			for (g = 0; g < sbi->s_groups_count; g++) {
				struct ext4_group_desc *gdp =
					ext4_get_group_desc(sb, g, NULL);

				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
					ext4_msg(sb, KERN_ERR,
						 "ext4_remount: Checksum for group %u failed (%u!=%u)",
						 g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
						 le16_to_cpu(gdp->bg_checksum));
					err = -EFSBADCRC;
					goto restore_opts;
				}
			}

			/*
			 * If we have an unprocessed orphan list hanging
			 * around from a previously readonly bdev mount,
			 * require a full umount/remount for now.
			 */
			if (es->s_last_orphan) {
				ext4_msg(sb, KERN_WARNING, "Couldn't "
					 "remount RDWR because of unprocessed "
					 "orphan inode list.  Please "
					 "umount/remount instead");
				err = -EINVAL;
				goto restore_opts;
			}

			/*
			 * Mounting a RDONLY partition read-write, so reread
			 * and store the current valid flag.  (It may have
			 * been changed by e2fsck since we originally mounted
			 * the partition.)
			 */
			if (sbi->s_journal) {
				err = ext4_clear_journal_err(sb, es);
				if (err)
					goto restore_opts;
			}
			sbi->s_mount_state = (le16_to_cpu(es->s_state) &
					      ~EXT4_FC_REPLAY);

			err = ext4_setup_super(sb, es, 0);
			if (err)
				goto restore_opts;

			sb->s_flags &= ~SB_RDONLY;
			if (ext4_has_feature_mmp(sb)) {
				err = ext4_multi_mount_protect(sb,
						le64_to_cpu(es->s_mmp_block));
				if (err)
					goto restore_opts;
			}
#ifdef CONFIG_QUOTA
			enable_quota = 1;
#endif
		}
	}

	/*
	 * Handle creation of system zone data early because it can fail.
	 * Releasing of existing data is done when we are sure remount will
	 * succeed.
	 */
	if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
		err = ext4_setup_system_zone(sb);
		if (err)
			goto restore_opts;
	}

	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
		err = ext4_commit_super(sb);
		if (err)
			goto restore_opts;
	}

#ifdef CONFIG_QUOTA
	if (enable_quota) {
		if (sb_any_quota_suspended(sb))
			dquot_resume(sb, -1);
		else if (ext4_has_feature_quota(sb)) {
			err = ext4_enable_quotas(sb);
			if (err)
				goto restore_opts;
		}
	}
	/* Release old quota file names */
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(old_opts.s_qf_names[i]);
#endif
	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
		ext4_release_system_zone(sb);

	/*
	 * Reinitialize lazy itable initialization thread based on
	 * current settings.
	 */
	if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
		ext4_unregister_li_request(sb);
	else {
		ext4_group_t first_not_zeroed;
		first_not_zeroed = ext4_has_uninit_itable(sb);
		ext4_register_li_request(sb, first_not_zeroed);
	}

	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
		ext4_stop_mmpd(sbi);

	/*
	 * Some options can be enabled by ext4 and/or by the VFS mount flags;
	 * either way we need to make sure they match in both *flags and
	 * s_flags.  Copy the selected flags from s_flags to *flags.
	 */
	*flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);

	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
	kfree(orig_data);
	return 0;

restore_opts:
	/*
	 * If there was a failing r/w to ro transition, we may need to
	 * re-enable quota.
	 */
	if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
	    sb_any_quota_suspended(sb))
		dquot_resume(sb, -1);
	sb->s_flags = old_sb_flags;
	sbi->s_mount_opt = old_opts.s_mount_opt;
	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
	sbi->s_resuid = old_opts.s_resuid;
	sbi->s_resgid = old_opts.s_resgid;
	sbi->s_commit_interval = old_opts.s_commit_interval;
	sbi->s_min_batch_time = old_opts.s_min_batch_time;
	sbi->s_max_batch_time = old_opts.s_max_batch_time;
	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
		ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
		to_free[i] = get_qf_name(sb, sbi, i);
		rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
	}
	synchronize_rcu();
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(to_free[i]);
#endif
	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
		ext4_stop_mmpd(sbi);
	kfree(orig_data);
	return err;
}

#ifdef CONFIG_QUOTA
static int ext4_statfs_project(struct super_block *sb,
			       kprojid_t projid, struct kstatfs *buf)
{
	struct kqid qid;
	struct dquot *dquot;
	u64 limit;
	u64 curblock;

	qid = make_kqid_projid(projid);
	dquot = dqget(sb, qid);
	if (IS_ERR(dquot))
		return PTR_ERR(dquot);
	spin_lock(&dquot->dq_dqb_lock);

	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
			     dquot->dq_dqb.dqb_bhardlimit);
	limit >>= sb->s_blocksize_bits;

	if (limit && buf->f_blocks > limit) {
		curblock = (dquot->dq_dqb.dqb_curspace +
			    dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
		buf->f_blocks = limit;
		buf->f_bfree = buf->f_bavail =
			(buf->f_blocks > curblock) ?
			 (buf->f_blocks - curblock) : 0;
	}

	limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
			     dquot->dq_dqb.dqb_ihardlimit);
	if (limit && buf->f_files > limit) {
		buf->f_files = limit;
		buf->f_ffree =
			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
	}

	spin_unlock(&dquot->dq_dqb_lock);
	dqput(dquot);
	return 0;
}
#endif
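/*
 * Effect of the clamping in ext4_statfs_project(), by example
 * (hypothetical numbers, assuming the dquot space fields are in bytes):
 * on a 4 KiB-block filesystem, a project with a 1 GiB hard limit and
 * 256 MiB in use reports f_blocks = (1 GiB >> 12) = 262144 and
 * f_bfree = f_bavail = 262144 - 65536 = 196608, so statfs(2)/df(1)
 * inside the project tree see the quota, not the whole device:
 *
 *	limit = min_not_zero(soft, hard) >> s_blocksize_bits;
 *	if (limit && buf->f_blocks > limit)
 *		buf->f_blocks = limit;
 */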
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	ext4_fsblk_t overhead = 0, resv_blocks;
	u64 fsid;
	s64 bfree;
	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));

	if (!test_opt(sb, MINIX_DF))
		overhead = sbi->s_overhead;

	buf->f_type = EXT4_SUPER_MAGIC;
	buf->f_bsize = sb->s_blocksize;
	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
	/* prevent underflow in case little free space is available */
	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
	buf->f_bavail = buf->f_bfree -
			(ext4_r_blocks_count(es) + resv_blocks);
	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
		buf->f_bavail = 0;
	buf->f_files = le32_to_cpu(es->s_inodes_count);
	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
	buf->f_namelen = EXT4_NAME_LEN;
	fsid = le64_to_cpup((void *)es->s_uuid) ^
	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
	buf->f_fsid = u64_to_fsid(fsid);

#ifdef CONFIG_QUOTA
	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
#endif
	return 0;
}

#ifdef CONFIG_QUOTA

/*
 * Helper functions so that the transaction is started before we acquire
 * dqio_sem, to keep the correct lock ordering of transaction > dqio_sem.
 */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}

static int ext4_write_dquot(struct dquot *dquot)
{
	int ret, err;
	handle_t *handle;
	struct inode *inode;

	inode = dquot_to_inode(dquot);
	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ret = dquot_commit(dquot);
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;
}

static int ext4_acquire_dquot(struct dquot *dquot)
{
	int ret, err;
	handle_t *handle;

	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ret = dquot_acquire(dquot);
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;
}

static int ext4_release_dquot(struct dquot *dquot)
{
	int ret, err;
	handle_t *handle;

	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
	if (IS_ERR(handle)) {
		/* Release dquot anyway to avoid endless cycle in dqput() */
		dquot_release(dquot);
		return PTR_ERR(handle);
	}
	ret = dquot_release(dquot);
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;
}

static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
	struct super_block *sb = dquot->dq_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/* Are we journaling quotas? */
	if (ext4_has_feature_quota(sb) ||
	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
		dquot_mark_dquot_dirty(dquot);
		return ext4_write_dquot(dquot);
	} else {
		return dquot_mark_dquot_dirty(dquot);
	}
}
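/*
 * The dquot hooks above all follow one skeleton: open a transaction
 * sized for the operation, call the generic dquot helper, then close the
 * handle and fold the two error codes together.  A sketch of the shape
 * (dquot_op stands for dquot_commit()/_acquire()/_release()):
 *
 *	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	ret = dquot_op(dquot);
 *	err = ext4_journal_stop(handle);
 *	return ret ? ret : err;
 */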
static int ext4_write_info(struct super_block *sb, int type)
{
	int ret, err;
	handle_t *handle;

	/* Data block + inode block */
	handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ret = dquot_commit_info(sb, type);
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;
}

/*
 * Turn on quotas during mount time - we need to find
 * the quota file and such...
 */
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
	return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
					EXT4_SB(sb)->s_jquota_fmt, type);
}

static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The first argument of lockdep_set_subclass has to be
	 * *exactly* the same as the argument to init_rwsem() --- in
	 * this case, in init_once() --- or lockdep gets unhappy
	 * because the name of the lock is set using the
	 * stringification of the argument to init_rwsem().
	 */
	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
	lockdep_set_subclass(&ei->i_data_sem, subclass);
}

/*
 * Standard function to be called on quota_on
 */
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 const struct path *path)
{
	int err;

	if (!test_opt(sb, QUOTA))
		return -EINVAL;

	/* Quotafile not on the same filesystem? */
	if (path->dentry->d_sb != sb)
		return -EXDEV;

	/* Quota already enabled for this file? */
	if (IS_NOQUOTA(d_inode(path->dentry)))
		return -EBUSY;

	/* Journaling quota? */
	if (EXT4_SB(sb)->s_qf_names[type]) {
		/* Quotafile not in fs root? */
		if (path->dentry->d_parent != sb->s_root)
			ext4_msg(sb, KERN_WARNING,
				 "Quota file not on filesystem root. "
				 "Journaled quota will not work");
		sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
	} else {
		/*
		 * Clear the flag just in case mount options changed since
		 * last time.
		 */
		sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
	}

	/*
	 * When we journal data on quota file, we have to flush journal to see
	 * all updates to the file when we bypass pagecache...
	 */
	if (EXT4_SB(sb)->s_journal &&
	    ext4_should_journal_data(d_inode(path->dentry))) {
		/*
		 * We don't need to lock updates but journal_flush() could
		 * otherwise be livelocked...
		 */
		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
		if (err)
			return err;
	}

	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
	err = dquot_quota_on(sb, type, format_id, path);
	if (!err) {
		struct inode *inode = d_inode(path->dentry);
		handle_t *handle;

		/*
		 * Set inode flags to prevent userspace from messing with quota
		 * files.  If this fails, we return success anyway since quotas
		 * are already enabled and this is not a hard failure.
		 */
		inode_lock(inode);
		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
		if (IS_ERR(handle))
			goto unlock_inode;
		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
				S_NOATIME | S_IMMUTABLE);
		err = ext4_mark_inode_dirty(handle, inode);
		ext4_journal_stop(handle);
	unlock_inode:
		inode_unlock(inode);
		if (err)
			dquot_quota_off(sb, type);
	}
	if (err)
		lockdep_set_quota_inode(path->dentry->d_inode,
					I_DATA_SEM_NORMAL);
	return err;
}
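/*
 * ->quota_on above is reached via the quotactl(2) syscall.  A userspace
 * sketch of turning on journaled user quotas with a vfsv1 quota file
 * (hypothetical paths, error handling omitted):
 *
 *	#include <sys/quota.h>
 *
 *	quotactl(QCMD(Q_QUOTAON, USRQUOTA), "/dev/sda1", QFMT_VFS_V1,
 *		 (caddr_t)"/mnt/aquota.user");
 *
 * The format is passed in the id argument and the quota file path in the
 * addr argument, which end up here as format_id and path.
 */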
static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
{
	switch (type) {
	case USRQUOTA:
		return qf_inum == EXT4_USR_QUOTA_INO;
	case GRPQUOTA:
		return qf_inum == EXT4_GRP_QUOTA_INO;
	case PRJQUOTA:
		return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
	default:
		BUG();
	}
}

static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags)
{
	int err;
	struct inode *qf_inode;
	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
	};

	BUG_ON(!ext4_has_feature_quota(sb));

	if (!qf_inums[type])
		return -EPERM;

	if (!ext4_check_quota_inum(type, qf_inums[type])) {
		ext4_error(sb, "Bad quota inum: %lu, type: %d",
			   qf_inums[type], type);
		return -EUCLEAN;
	}

	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
	if (IS_ERR(qf_inode)) {
		ext4_error(sb, "Bad quota inode: %lu, type: %d",
			   qf_inums[type], type);
		return PTR_ERR(qf_inode);
	}

	/* Don't account quota for quota files to avoid recursion */
	qf_inode->i_flags |= S_NOQUOTA;
	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
	err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
	if (err)
		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
	iput(qf_inode);

	return err;
}

/* Enable usage tracking for all quota types. */
static int ext4_enable_quotas(struct super_block *sb)
{
	int type, err = 0;
	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
	};
	bool quota_mopt[EXT4_MAXQUOTAS] = {
		test_opt(sb, USRQUOTA),
		test_opt(sb, GRPQUOTA),
		test_opt(sb, PRJQUOTA),
	};

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
		if (qf_inums[type]) {
			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
				DQUOT_USAGE_ENABLED |
				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
			if (err) {
				ext4_warning(sb,
					"Failed to enable quota tracking "
					"(type=%d, err=%d, ino=%lu). "
					"Please run e2fsck to fix.", type,
					err, qf_inums[type]);
				/*
				 * Unwind: switch off the types that were
				 * already enabled, in reverse order.  The
				 * igrab() keeps the quota inode pinned
				 * across dquot_quota_off() so the lockdep
				 * subclass can still be reset before the
				 * final iput().
				 */
				for (type--; type >= 0; type--) {
					struct inode *inode;

					inode = sb_dqopt(sb)->files[type];
					if (inode)
						inode = igrab(inode);
					dquot_quota_off(sb, type);
					if (inode) {
						lockdep_set_quota_inode(inode,
							I_DATA_SEM_NORMAL);
						iput(inode);
					}
				}

				return err;
			}
		}
	}
	return 0;
}

static int ext4_quota_off(struct super_block *sb, int type)
{
	struct inode *inode = sb_dqopt(sb)->files[type];
	handle_t *handle;
	int err;

	/*
	 * Force all delayed allocation blocks to be allocated.
	 * Caller already holds s_umount sem.
	 */
	if (test_opt(sb, DELALLOC))
		sync_filesystem(sb);

	if (!inode || !igrab(inode))
		goto out;

	err = dquot_quota_off(sb, type);
	if (err || ext4_has_feature_quota(sb))
		goto out_put;

	inode_lock(inode);
	/*
	 * Update modification times of quota files when userspace can
	 * start looking at them.  If we fail, we return success anyway since
	 * this is not a hard failure and quotas are already disabled.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto out_unlock;
	}
	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
	inode->i_mtime = inode->i_ctime = current_time(inode);
	err = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
out_unlock:
	inode_unlock(inode);
out_put:
	lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
	iput(inode);
	return err;
out:
	return dquot_quota_off(sb, type);
}

/*
 * Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks...  As quota files are never truncated and quota code
 * itself serializes the operations (and no one else should touch the files)
 * we don't have to be afraid of races.
 */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off)
{
	struct inode *inode = sb_dqopt(sb)->files[type];
	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
	int offset = off & (sb->s_blocksize - 1);
	int tocopy;
	size_t toread;
	struct buffer_head *bh;
	loff_t i_size = i_size_read(inode);

	if (off > i_size)
		return 0;
	if (off + len > i_size)
		len = i_size - off;
	toread = len;
	while (toread > 0) {
		tocopy = sb->s_blocksize - offset < toread ?
				sb->s_blocksize - offset : toread;
		bh = ext4_bread(NULL, inode, blk, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		if (!bh)	/* A hole? */
			memset(data, 0, tocopy);
		else
			memcpy(data, bh->b_data + offset, tocopy);
		brelse(bh);
		offset = 0;
		toread -= tocopy;
		data += tocopy;
		blk++;
	}
	return len;
}
/*
 * Write to quotafile (we know the transaction is already started and has
 * enough credits).
 */
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off)
{
	struct inode *inode = sb_dqopt(sb)->files[type];
	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
	int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
	int retries = 0;
	struct buffer_head *bh;
	handle_t *handle = journal_current_handle();

	if (!handle) {
		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
			 " cancelled because transaction is not started",
			 (unsigned long long)off, (unsigned long long)len);
		return -EIO;
	}
	/*
	 * Since we account only one data block in transaction credits,
	 * it is impossible to cross a block boundary.
	 */
	if (sb->s_blocksize - offset < len) {
		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
			 " cancelled because not block aligned",
			 (unsigned long long)off, (unsigned long long)len);
		return -EIO;
	}

	do {
		bh = ext4_bread(handle, inode, blk,
				EXT4_GET_BLOCKS_CREATE |
				EXT4_GET_BLOCKS_METADATA_NOFAIL);
	} while (PTR_ERR(bh) == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	if (!bh)
		goto out;
	BUFFER_TRACE(bh, "get write access");
	err = ext4_journal_get_write_access(handle, bh);
	if (err) {
		brelse(bh);
		return err;
	}
	lock_buffer(bh);
	memcpy(bh->b_data + offset, data, len);
	flush_dcache_page(bh->b_page);
	unlock_buffer(bh);
	err = ext4_handle_dirty_metadata(handle, NULL, bh);
	brelse(bh);
out:
	if (inode->i_size < off + len) {
		i_size_write(inode, off + len);
		EXT4_I(inode)->i_disksize = inode->i_size;
		err2 = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(err2 && !err))
			err = err2;
	}
	return err ? err : len;
}
#endif
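/*
 * ext4_quota_write() above is the journaled-metadata pattern in
 * miniature: get the buffer, declare write access to the running
 * transaction, modify under the buffer lock, then hand the buffer back
 * as dirty metadata.  The skeleton, as a sketch:
 *
 *	err = ext4_journal_get_write_access(handle, bh);
 *	...
 *	lock_buffer(bh);
 *	memcpy(bh->b_data + offset, data, len);
 *	unlock_buffer(bh);
 *	err = ext4_handle_dirty_metadata(handle, NULL, bh);
 */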
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}

#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static inline void register_as_ext2(void)
{
	int err = register_filesystem(&ext2_fs_type);
	if (err)
		printk(KERN_WARNING
		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}

static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;
	if (sb_rdonly(sb))
		return 1;
	if (ext4_has_unknown_ext2_ro_compat_features(sb))
		return 0;
	return 1;
}
#else
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif

static inline void register_as_ext3(void)
{
	int err = register_filesystem(&ext3_fs_type);
	if (err)
		printk(KERN_WARNING
		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
}

static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}

static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb))
		return 0;
	if (!ext4_has_feature_journal(sb))
		return 0;
	if (sb_rdonly(sb))
		return 1;
	if (ext4_has_unknown_ext3_ro_compat_features(sb))
		return 0;
	return 1;
}

static struct file_system_type ext4_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext4",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext4");

/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

static int __init ext4_init_fs(void)
{
	int i, err;

	ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
	ext4_li_info = NULL;
	mutex_init(&ext4_li_mtx);

	/* Build-time check for flags consistency */
	ext4_check_flag_values();

	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
		init_waitqueue_head(&ext4__ioend_wq[i]);

	err = ext4_init_es();
	if (err)
		return err;

	err = ext4_init_pending();
	if (err)
		goto out7;

	err = ext4_init_post_read_processing();
	if (err)
		goto out6;

	err = ext4_init_pageio();
	if (err)
		goto out5;

	err = ext4_init_system_zone();
	if (err)
		goto out4;

	err = ext4_init_sysfs();
	if (err)
		goto out3;

	err = ext4_init_mballoc();
	if (err)
		goto out2;
	err = init_inodecache();
	if (err)
		goto out1;

	err = ext4_fc_init_dentry_cache();
	if (err)
		goto out05;

	register_as_ext3();
	register_as_ext2();
	err = register_filesystem(&ext4_fs_type);
	if (err)
		goto out;

	return 0;
out:
	unregister_as_ext2();
	unregister_as_ext3();
	ext4_fc_destroy_dentry_cache();
out05:
	destroy_inodecache();
out1:
	ext4_exit_mballoc();
out2:
	ext4_exit_sysfs();
out3:
	ext4_exit_system_zone();
out4:
	ext4_exit_pageio();
out5:
	ext4_exit_post_read_processing();
out6:
	ext4_exit_pending();
out7:
	ext4_exit_es();

	return err;
}
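/*
 * The out*: ladder in ext4_init_fs() above tears down initialization in
 * exactly the reverse order it was performed; each label undoes
 * everything set up *before* the step that failed.  The general shape,
 * as a sketch with hypothetical names:
 *
 *	err = init_a();
 *	if (err)
 *		return err;
 *	err = init_b();
 *	if (err)
 *		goto undo_a;
 *	err = init_c();
 *	if (err)
 *		goto undo_b;
 *	return 0;
 * undo_b:
 *	exit_b();
 * undo_a:
 *	exit_a();
 *	return err;
 */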
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	ext4_fc_destroy_dentry_cache();
	destroy_inodecache();
	ext4_exit_mballoc();
	ext4_exit_sysfs();
	ext4_exit_system_zone();
	ext4_exit_pageio();
	ext4_exit_post_read_processing();
	ext4_exit_es();
	ext4_exit_pending();
}

MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)