1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 4 * Written by Alex Tomas <alex@clusterfs.com> 5 * 6 * Architecture independence: 7 * Copyright (c) 2005, Bull S.A. 8 * Written by Pierre Peiffer <pierre.peiffer@bull.net> 9 */ 10 11/* 12 * Extents support for EXT4 13 * 14 * TODO: 15 * - ext4*_error() should be used in some situations 16 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate 17 * - smart tree reduction 18 */ 19 20#include <linux/fs.h> 21#include <linux/time.h> 22#include <linux/jbd2.h> 23#include <linux/highuid.h> 24#include <linux/pagemap.h> 25#include <linux/quotaops.h> 26#include <linux/string.h> 27#include <linux/slab.h> 28#include <linux/uaccess.h> 29#include <linux/fiemap.h> 30#include <linux/backing-dev.h> 31#include <linux/iomap.h> 32#include "ext4_jbd2.h" 33#include "ext4_extents.h" 34#include "xattr.h" 35 36#include <trace/events/ext4.h> 37 38/* 39 * used by extent splitting. 40 */ 41#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ 42 due to ENOSPC */ 43#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ 44#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ 45 46#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ 47#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ 48 49static __le32 ext4_extent_block_csum(struct inode *inode, 50 struct ext4_extent_header *eh) 51{ 52 struct ext4_inode_info *ei = EXT4_I(inode); 53 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 54 __u32 csum; 55 56 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, 57 EXT4_EXTENT_TAIL_OFFSET(eh)); 58 return cpu_to_le32(csum); 59} 60 61static int ext4_extent_block_csum_verify(struct inode *inode, 62 struct ext4_extent_header *eh) 63{ 64 struct ext4_extent_tail *et; 65 66 if (!ext4_has_metadata_csum(inode->i_sb)) 67 return 1; 68 69 et = find_ext4_extent_tail(eh); 70 if (et->et_checksum != ext4_extent_block_csum(inode, eh)) 71 return 0; 72 return 1; 73} 74 75static void ext4_extent_block_csum_set(struct inode *inode, 76 struct ext4_extent_header *eh) 77{ 78 struct ext4_extent_tail *et; 79 80 if (!ext4_has_metadata_csum(inode->i_sb)) 81 return; 82 83 et = find_ext4_extent_tail(eh); 84 et->et_checksum = ext4_extent_block_csum(inode, eh); 85} 86 87static int ext4_split_extent_at(handle_t *handle, 88 struct inode *inode, 89 struct ext4_ext_path **ppath, 90 ext4_lblk_t split, 91 int split_flag, 92 int flags); 93 94static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) 95{ 96 /* 97 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 98 * moment, get_block can be called only for blocks inside i_size since 99 * page cache has been already dropped and writes are blocked by 100 * i_mutex. So we can safely drop the i_data_sem here. 101 */ 102 BUG_ON(EXT4_JOURNAL(inode) == NULL); 103 ext4_discard_preallocations(inode, 0); 104 up_write(&EXT4_I(inode)->i_data_sem); 105 *dropped = 1; 106 return 0; 107} 108 109/* 110 * Make sure 'handle' has at least 'check_cred' credits. If not, restart 111 * transaction with 'restart_cred' credits. The function drops i_data_sem 112 * when restarting transaction and gets it after transaction is restarted. 113 * 114 * The function returns 0 on success, 1 if transaction had to be restarted, 115 * and < 0 in case of fatal error. 
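 *
 * A positive return therefore means "the transaction was restarted and
 * i_data_sem was briefly dropped", not a failure.  A minimal sketch of
 * the expected calling pattern (illustrative only, the names below are
 * placeholders):
 *
 *	err = ext4_datasem_ensure_credits(handle, inode, needed,
 *					  restart, revoke);
 *	if (err < 0)
 *		goto out;		(fatal error)
 *	else if (err > 0)
 *		re-read the extent path, it may be stale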
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
				int check_cred, int restart_cred,
				int revoke_cred)
{
	int ret;
	int dropped = 0;

	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
	if (dropped)
		down_write(&EXT4_I(inode)->i_data_sem);
	return ret;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err = 0;

	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, path->p_bh);
		/*
		 * The extent buffer's verified bit will be set again in
		 * __ext4_ext_dirty(). We could leave an inconsistent
		 * buffer if the extent updating procedure breaks off due
		 * to some error, so force it to be checked again.
		 */
		if (!err)
			clear_buffer_verified(path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int __ext4_ext_dirty(const char *where, unsigned int line,
			    handle_t *handle, struct inode *inode,
			    struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
		/* Extents updating done, re-set verified flag */
		if (!err)
			set_buffer_verified(path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

#define ext4_ext_dirty(handle, inode, path) \
		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
				       struct ext4_ext_path *path,
				       ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object's sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file. However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space. Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK.
use inode's group */ 233 return ext4_inode_to_goal_block(inode); 234} 235 236/* 237 * Allocation for a meta data block 238 */ 239static ext4_fsblk_t 240ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, 241 struct ext4_ext_path *path, 242 struct ext4_extent *ex, int *err, unsigned int flags) 243{ 244 ext4_fsblk_t goal, newblock; 245 246 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 247 newblock = ext4_new_meta_blocks(handle, inode, goal, flags, 248 NULL, err); 249 return newblock; 250} 251 252static inline int ext4_ext_space_block(struct inode *inode, int check) 253{ 254 int size; 255 256 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 257 / sizeof(struct ext4_extent); 258#ifdef AGGRESSIVE_TEST 259 if (!check && size > 6) 260 size = 6; 261#endif 262 return size; 263} 264 265static inline int ext4_ext_space_block_idx(struct inode *inode, int check) 266{ 267 int size; 268 269 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 270 / sizeof(struct ext4_extent_idx); 271#ifdef AGGRESSIVE_TEST 272 if (!check && size > 5) 273 size = 5; 274#endif 275 return size; 276} 277 278static inline int ext4_ext_space_root(struct inode *inode, int check) 279{ 280 int size; 281 282 size = sizeof(EXT4_I(inode)->i_data); 283 size -= sizeof(struct ext4_extent_header); 284 size /= sizeof(struct ext4_extent); 285#ifdef AGGRESSIVE_TEST 286 if (!check && size > 3) 287 size = 3; 288#endif 289 return size; 290} 291 292static inline int ext4_ext_space_root_idx(struct inode *inode, int check) 293{ 294 int size; 295 296 size = sizeof(EXT4_I(inode)->i_data); 297 size -= sizeof(struct ext4_extent_header); 298 size /= sizeof(struct ext4_extent_idx); 299#ifdef AGGRESSIVE_TEST 300 if (!check && size > 4) 301 size = 4; 302#endif 303 return size; 304} 305 306static inline int 307ext4_force_split_extent_at(handle_t *handle, struct inode *inode, 308 struct ext4_ext_path **ppath, ext4_lblk_t lblk, 309 int nofail) 310{ 311 struct ext4_ext_path *path = *ppath; 312 int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); 313 int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; 314 315 if (nofail) 316 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; 317 318 return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? 
319 EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, 320 flags); 321} 322 323static int 324ext4_ext_max_entries(struct inode *inode, int depth) 325{ 326 int max; 327 328 if (depth == ext_depth(inode)) { 329 if (depth == 0) 330 max = ext4_ext_space_root(inode, 1); 331 else 332 max = ext4_ext_space_root_idx(inode, 1); 333 } else { 334 if (depth == 0) 335 max = ext4_ext_space_block(inode, 1); 336 else 337 max = ext4_ext_space_block_idx(inode, 1); 338 } 339 340 return max; 341} 342 343static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 344{ 345 ext4_fsblk_t block = ext4_ext_pblock(ext); 346 int len = ext4_ext_get_actual_len(ext); 347 ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); 348 349 /* 350 * We allow neither: 351 * - zero length 352 * - overflow/wrap-around 353 */ 354 if (lblock + len <= lblock) 355 return 0; 356 return ext4_inode_block_valid(inode, block, len); 357} 358 359static int ext4_valid_extent_idx(struct inode *inode, 360 struct ext4_extent_idx *ext_idx) 361{ 362 ext4_fsblk_t block = ext4_idx_pblock(ext_idx); 363 364 return ext4_inode_block_valid(inode, block, 1); 365} 366 367static int ext4_valid_extent_entries(struct inode *inode, 368 struct ext4_extent_header *eh, 369 ext4_lblk_t lblk, ext4_fsblk_t *pblk, 370 int depth) 371{ 372 unsigned short entries; 373 ext4_lblk_t lblock = 0; 374 ext4_lblk_t cur = 0; 375 376 if (eh->eh_entries == 0) 377 return 1; 378 379 entries = le16_to_cpu(eh->eh_entries); 380 381 if (depth == 0) { 382 /* leaf entries */ 383 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); 384 385 /* 386 * The logical block in the first entry should equal to 387 * the number in the index block. 388 */ 389 if (depth != ext_depth(inode) && 390 lblk != le32_to_cpu(ext->ee_block)) 391 return 0; 392 while (entries) { 393 if (!ext4_valid_extent(inode, ext)) 394 return 0; 395 396 /* Check for overlapping extents */ 397 lblock = le32_to_cpu(ext->ee_block); 398 if (lblock < cur) { 399 *pblk = ext4_ext_pblock(ext); 400 return 0; 401 } 402 cur = lblock + ext4_ext_get_actual_len(ext); 403 ext++; 404 entries--; 405 } 406 } else { 407 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); 408 409 /* 410 * The logical block in the first entry should equal to 411 * the number in the parent index block. 
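		 * (An index entry records the first logical block of the
		 * subtree it points to, so the first entry one level down
		 * must start at exactly that block; a mismatch means the
		 * index and its child disagree.)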
412 */ 413 if (depth != ext_depth(inode) && 414 lblk != le32_to_cpu(ext_idx->ei_block)) 415 return 0; 416 while (entries) { 417 if (!ext4_valid_extent_idx(inode, ext_idx)) 418 return 0; 419 420 /* Check for overlapping index extents */ 421 lblock = le32_to_cpu(ext_idx->ei_block); 422 if (lblock < cur) { 423 *pblk = ext4_idx_pblock(ext_idx); 424 return 0; 425 } 426 ext_idx++; 427 entries--; 428 cur = lblock + 1; 429 } 430 } 431 return 1; 432} 433 434static int __ext4_ext_check(const char *function, unsigned int line, 435 struct inode *inode, struct ext4_extent_header *eh, 436 int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) 437{ 438 const char *error_msg; 439 int max = 0, err = -EFSCORRUPTED; 440 441 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { 442 error_msg = "invalid magic"; 443 goto corrupted; 444 } 445 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { 446 error_msg = "unexpected eh_depth"; 447 goto corrupted; 448 } 449 if (unlikely(eh->eh_max == 0)) { 450 error_msg = "invalid eh_max"; 451 goto corrupted; 452 } 453 max = ext4_ext_max_entries(inode, depth); 454 if (unlikely(le16_to_cpu(eh->eh_max) > max)) { 455 error_msg = "too large eh_max"; 456 goto corrupted; 457 } 458 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { 459 error_msg = "invalid eh_entries"; 460 goto corrupted; 461 } 462 if (unlikely((eh->eh_entries == 0) && (depth > 0))) { 463 error_msg = "eh_entries is 0 but eh_depth is > 0"; 464 goto corrupted; 465 } 466 if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { 467 error_msg = "invalid extent entries"; 468 goto corrupted; 469 } 470 if (unlikely(depth > 32)) { 471 error_msg = "too large eh_depth"; 472 goto corrupted; 473 } 474 /* Verify checksum on non-root extent tree nodes */ 475 if (ext_depth(inode) != depth && 476 !ext4_extent_block_csum_verify(inode, eh)) { 477 error_msg = "extent tree corrupted"; 478 err = -EFSBADCRC; 479 goto corrupted; 480 } 481 return 0; 482 483corrupted: 484 ext4_error_inode_err(inode, function, line, 0, -err, 485 "pblk %llu bad header/extent: %s - magic %x, " 486 "entries %u, max %u(%u), depth %u(%u)", 487 (unsigned long long) pblk, error_msg, 488 le16_to_cpu(eh->eh_magic), 489 le16_to_cpu(eh->eh_entries), 490 le16_to_cpu(eh->eh_max), 491 max, le16_to_cpu(eh->eh_depth), depth); 492 return err; 493} 494 495#define ext4_ext_check(inode, eh, depth, pblk) \ 496 __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) 497 498int ext4_ext_check_inode(struct inode *inode) 499{ 500 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); 501} 502 503static void ext4_cache_extents(struct inode *inode, 504 struct ext4_extent_header *eh) 505{ 506 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); 507 ext4_lblk_t prev = 0; 508 int i; 509 510 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { 511 unsigned int status = EXTENT_STATUS_WRITTEN; 512 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); 513 int len = ext4_ext_get_actual_len(ex); 514 515 if (prev && (prev != lblk)) 516 ext4_es_cache_extent(inode, prev, lblk - prev, ~0, 517 EXTENT_STATUS_HOLE); 518 519 if (ext4_ext_is_unwritten(ex)) 520 status = EXTENT_STATUS_UNWRITTEN; 521 ext4_es_cache_extent(inode, lblk, len, 522 ext4_ext_pblock(ex), status); 523 prev = lblk + len; 524 } 525} 526 527static struct buffer_head * 528__read_extent_tree_block(const char *function, unsigned int line, 529 struct inode *inode, struct ext4_extent_idx *idx, 530 int depth, int flags) 531{ 532 struct buffer_head *bh; 533 int err; 534 gfp_t gfp_flags = __GFP_MOVABLE 
| GFP_NOFS; 535 ext4_fsblk_t pblk; 536 537 if (flags & EXT4_EX_NOFAIL) 538 gfp_flags |= __GFP_NOFAIL; 539 540 pblk = ext4_idx_pblock(idx); 541 bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); 542 if (unlikely(!bh)) 543 return ERR_PTR(-ENOMEM); 544 545 if (!bh_uptodate_or_lock(bh)) { 546 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); 547 err = ext4_read_bh(bh, 0, NULL); 548 if (err < 0) 549 goto errout; 550 } 551 if (buffer_verified(bh)) { 552 if (unlikely(ext_block_hdr(bh)->eh_magic != EXT4_EXT_MAGIC)) { 553 err = -EFSCORRUPTED; 554 ext4_error_inode(inode, function, line, 0, 555 "invalid magic for verified extent block %llu", 556 (unsigned long long)bh->b_blocknr); 557 goto errout; 558 } 559 560 if (!(flags & EXT4_EX_FORCE_CACHE)) 561 return bh; 562 } else { 563 err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), 564 depth, pblk, le32_to_cpu(idx->ei_block)); 565 if (err) 566 goto errout; 567 set_buffer_verified(bh); 568 } 569 /* 570 * If this is a leaf block, cache all of its entries 571 */ 572 if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { 573 struct ext4_extent_header *eh = ext_block_hdr(bh); 574 ext4_cache_extents(inode, eh); 575 } 576 return bh; 577errout: 578 put_bh(bh); 579 return ERR_PTR(err); 580 581} 582 583#define read_extent_tree_block(inode, idx, depth, flags) \ 584 __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ 585 (depth), (flags)) 586 587/* 588 * This function is called to cache a file's extent information in the 589 * extent status tree 590 */ 591int ext4_ext_precache(struct inode *inode) 592{ 593 struct ext4_inode_info *ei = EXT4_I(inode); 594 struct ext4_ext_path *path = NULL; 595 struct buffer_head *bh; 596 int i = 0, depth, ret = 0; 597 598 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 599 return 0; /* not an extent-mapped inode */ 600 601 down_read(&ei->i_data_sem); 602 depth = ext_depth(inode); 603 604 /* Don't cache anything if there are no external extent blocks */ 605 if (!depth) { 606 up_read(&ei->i_data_sem); 607 return ret; 608 } 609 610 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), 611 GFP_NOFS); 612 if (path == NULL) { 613 up_read(&ei->i_data_sem); 614 return -ENOMEM; 615 } 616 617 path[0].p_hdr = ext_inode_hdr(inode); 618 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); 619 if (ret) 620 goto out; 621 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); 622 while (i >= 0) { 623 /* 624 * If this is a leaf block or we've reached the end of 625 * the index block, go up 626 */ 627 if ((i == depth) || 628 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { 629 brelse(path[i].p_bh); 630 path[i].p_bh = NULL; 631 i--; 632 continue; 633 } 634 bh = read_extent_tree_block(inode, path[i].p_idx++, 635 depth - i - 1, 636 EXT4_EX_FORCE_CACHE); 637 if (IS_ERR(bh)) { 638 ret = PTR_ERR(bh); 639 break; 640 } 641 i++; 642 path[i].p_bh = bh; 643 path[i].p_hdr = ext_block_hdr(bh); 644 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); 645 } 646 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); 647out: 648 up_read(&ei->i_data_sem); 649 ext4_ext_drop_refs(path); 650 kfree(path); 651 return ret; 652} 653 654#ifdef EXT_DEBUG 655static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 656{ 657 int k, l = path->p_depth; 658 659 ext_debug(inode, "path:"); 660 for (k = 0; k <= l; k++, path++) { 661 if (path->p_idx) { 662 ext_debug(inode, " %d->%llu", 663 le32_to_cpu(path->p_idx->ei_block), 664 ext4_idx_pblock(path->p_idx)); 665 } else if (path->p_ext) { 666 ext_debug(inode, " %d:[%d]%d:%llu ", 667 
le32_to_cpu(path->p_ext->ee_block), 668 ext4_ext_is_unwritten(path->p_ext), 669 ext4_ext_get_actual_len(path->p_ext), 670 ext4_ext_pblock(path->p_ext)); 671 } else 672 ext_debug(inode, " []"); 673 } 674 ext_debug(inode, "\n"); 675} 676 677static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) 678{ 679 int depth = ext_depth(inode); 680 struct ext4_extent_header *eh; 681 struct ext4_extent *ex; 682 int i; 683 684 if (!path) 685 return; 686 687 eh = path[depth].p_hdr; 688 ex = EXT_FIRST_EXTENT(eh); 689 690 ext_debug(inode, "Displaying leaf extents\n"); 691 692 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 693 ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 694 ext4_ext_is_unwritten(ex), 695 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); 696 } 697 ext_debug(inode, "\n"); 698} 699 700static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, 701 ext4_fsblk_t newblock, int level) 702{ 703 int depth = ext_depth(inode); 704 struct ext4_extent *ex; 705 706 if (depth != level) { 707 struct ext4_extent_idx *idx; 708 idx = path[level].p_idx; 709 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { 710 ext_debug(inode, "%d: move %d:%llu in new index %llu\n", 711 level, le32_to_cpu(idx->ei_block), 712 ext4_idx_pblock(idx), newblock); 713 idx++; 714 } 715 716 return; 717 } 718 719 ex = path[depth].p_ext; 720 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { 721 ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", 722 le32_to_cpu(ex->ee_block), 723 ext4_ext_pblock(ex), 724 ext4_ext_is_unwritten(ex), 725 ext4_ext_get_actual_len(ex), 726 newblock); 727 ex++; 728 } 729} 730 731#else 732#define ext4_ext_show_path(inode, path) 733#define ext4_ext_show_leaf(inode, path) 734#define ext4_ext_show_move(inode, path, newblock, level) 735#endif 736 737void ext4_ext_drop_refs(struct ext4_ext_path *path) 738{ 739 int depth, i; 740 741 if (!path) 742 return; 743 depth = path->p_depth; 744 for (i = 0; i <= depth; i++, path++) { 745 brelse(path->p_bh); 746 path->p_bh = NULL; 747 } 748} 749 750/* 751 * ext4_ext_binsearch_idx: 752 * binary search for the closest index of the given block 753 * the header must be checked before calling this 754 */ 755static void 756ext4_ext_binsearch_idx(struct inode *inode, 757 struct ext4_ext_path *path, ext4_lblk_t block) 758{ 759 struct ext4_extent_header *eh = path->p_hdr; 760 struct ext4_extent_idx *r, *l, *m; 761 762 763 ext_debug(inode, "binsearch for %u(idx): ", block); 764 765 l = EXT_FIRST_INDEX(eh) + 1; 766 r = EXT_LAST_INDEX(eh); 767 while (l <= r) { 768 m = l + (r - l) / 2; 769 if (block < le32_to_cpu(m->ei_block)) 770 r = m - 1; 771 else 772 l = m + 1; 773 ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, 774 le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), 775 r, le32_to_cpu(r->ei_block)); 776 } 777 778 path->p_idx = l - 1; 779 ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), 780 ext4_idx_pblock(path->p_idx)); 781 782#ifdef CHECK_BINSEARCH 783 { 784 struct ext4_extent_idx *chix, *ix; 785 int k; 786 787 chix = ix = EXT_FIRST_INDEX(eh); 788 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 789 if (k != 0 && le32_to_cpu(ix->ei_block) <= 790 le32_to_cpu(ix[-1].ei_block)) { 791 printk(KERN_DEBUG "k=%d, ix=0x%p, " 792 "first=0x%p\n", k, 793 ix, EXT_FIRST_INDEX(eh)); 794 printk(KERN_DEBUG "%u <= %u\n", 795 le32_to_cpu(ix->ei_block), 796 le32_to_cpu(ix[-1].ei_block)); 797 } 798 BUG_ON(k && le32_to_cpu(ix->ei_block) 799 <= le32_to_cpu(ix[-1].ei_block)); 800 if (block < 
le32_to_cpu(ix->ei_block)) 801 break; 802 chix = ix; 803 } 804 BUG_ON(chix != path->p_idx); 805 } 806#endif 807 808} 809 810/* 811 * ext4_ext_binsearch: 812 * binary search for closest extent of the given block 813 * the header must be checked before calling this 814 */ 815static void 816ext4_ext_binsearch(struct inode *inode, 817 struct ext4_ext_path *path, ext4_lblk_t block) 818{ 819 struct ext4_extent_header *eh = path->p_hdr; 820 struct ext4_extent *r, *l, *m; 821 822 if (eh->eh_entries == 0) { 823 /* 824 * this leaf is empty: 825 * we get such a leaf in split/add case 826 */ 827 return; 828 } 829 830 ext_debug(inode, "binsearch for %u: ", block); 831 832 l = EXT_FIRST_EXTENT(eh) + 1; 833 r = EXT_LAST_EXTENT(eh); 834 835 while (l <= r) { 836 m = l + (r - l) / 2; 837 if (block < le32_to_cpu(m->ee_block)) 838 r = m - 1; 839 else 840 l = m + 1; 841 ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, 842 le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), 843 r, le32_to_cpu(r->ee_block)); 844 } 845 846 path->p_ext = l - 1; 847 ext_debug(inode, " -> %d:%llu:[%d]%d ", 848 le32_to_cpu(path->p_ext->ee_block), 849 ext4_ext_pblock(path->p_ext), 850 ext4_ext_is_unwritten(path->p_ext), 851 ext4_ext_get_actual_len(path->p_ext)); 852 853#ifdef CHECK_BINSEARCH 854 { 855 struct ext4_extent *chex, *ex; 856 int k; 857 858 chex = ex = EXT_FIRST_EXTENT(eh); 859 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { 860 BUG_ON(k && le32_to_cpu(ex->ee_block) 861 <= le32_to_cpu(ex[-1].ee_block)); 862 if (block < le32_to_cpu(ex->ee_block)) 863 break; 864 chex = ex; 865 } 866 BUG_ON(chex != path->p_ext); 867 } 868#endif 869 870} 871 872void ext4_ext_tree_init(handle_t *handle, struct inode *inode) 873{ 874 struct ext4_extent_header *eh; 875 876 eh = ext_inode_hdr(inode); 877 eh->eh_depth = 0; 878 eh->eh_entries = 0; 879 eh->eh_magic = EXT4_EXT_MAGIC; 880 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 881 eh->eh_generation = 0; 882 ext4_mark_inode_dirty(handle, inode); 883} 884 885struct ext4_ext_path * 886ext4_find_extent(struct inode *inode, ext4_lblk_t block, 887 struct ext4_ext_path **orig_path, int flags) 888{ 889 struct ext4_extent_header *eh; 890 struct buffer_head *bh; 891 struct ext4_ext_path *path = orig_path ? 
*orig_path : NULL; 892 short int depth, i, ppos = 0; 893 int ret; 894 gfp_t gfp_flags = GFP_NOFS; 895 896 if (flags & EXT4_EX_NOFAIL) 897 gfp_flags |= __GFP_NOFAIL; 898 899 eh = ext_inode_hdr(inode); 900 depth = ext_depth(inode); 901 if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { 902 EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", 903 depth); 904 ret = -EFSCORRUPTED; 905 goto err; 906 } 907 908 if (path) { 909 ext4_ext_drop_refs(path); 910 if (depth > path[0].p_maxdepth) { 911 kfree(path); 912 *orig_path = path = NULL; 913 } 914 } 915 if (!path) { 916 /* account possible depth increase */ 917 path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), 918 gfp_flags); 919 if (unlikely(!path)) 920 return ERR_PTR(-ENOMEM); 921 path[0].p_maxdepth = depth + 1; 922 } 923 path[0].p_hdr = eh; 924 path[0].p_bh = NULL; 925 926 i = depth; 927 if (!(flags & EXT4_EX_NOCACHE) && depth == 0) 928 ext4_cache_extents(inode, eh); 929 /* walk through the tree */ 930 while (i) { 931 ext_debug(inode, "depth %d: num %d, max %d\n", 932 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 933 934 ext4_ext_binsearch_idx(inode, path + ppos, block); 935 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); 936 path[ppos].p_depth = i; 937 path[ppos].p_ext = NULL; 938 939 bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); 940 if (IS_ERR(bh)) { 941 ret = PTR_ERR(bh); 942 goto err; 943 } 944 945 eh = ext_block_hdr(bh); 946 ppos++; 947 path[ppos].p_bh = bh; 948 path[ppos].p_hdr = eh; 949 } 950 951 path[ppos].p_depth = i; 952 path[ppos].p_ext = NULL; 953 path[ppos].p_idx = NULL; 954 955 /* find extent */ 956 ext4_ext_binsearch(inode, path + ppos, block); 957 /* if not an empty leaf */ 958 if (path[ppos].p_ext) 959 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); 960 961 ext4_ext_show_path(inode, path); 962 963 return path; 964 965err: 966 ext4_ext_drop_refs(path); 967 kfree(path); 968 if (orig_path) 969 *orig_path = NULL; 970 return ERR_PTR(ret); 971} 972 973/* 974 * ext4_ext_insert_index: 975 * insert new index [@logical;@ptr] into the block at @curp; 976 * check where to insert: before @curp or after @curp 977 */ 978static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 979 struct ext4_ext_path *curp, 980 int logical, ext4_fsblk_t ptr) 981{ 982 struct ext4_extent_idx *ix; 983 int len, err; 984 985 err = ext4_ext_get_access(handle, inode, curp); 986 if (err) 987 return err; 988 989 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { 990 EXT4_ERROR_INODE(inode, 991 "logical %d == ei_block %d!", 992 logical, le32_to_cpu(curp->p_idx->ei_block)); 993 return -EFSCORRUPTED; 994 } 995 996 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) 997 >= le16_to_cpu(curp->p_hdr->eh_max))) { 998 EXT4_ERROR_INODE(inode, 999 "eh_entries %d >= eh_max %d!", 1000 le16_to_cpu(curp->p_hdr->eh_entries), 1001 le16_to_cpu(curp->p_hdr->eh_max)); 1002 return -EFSCORRUPTED; 1003 } 1004 1005 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 1006 /* insert after */ 1007 ext_debug(inode, "insert new index %d after: %llu\n", 1008 logical, ptr); 1009 ix = curp->p_idx + 1; 1010 } else { 1011 /* insert before */ 1012 ext_debug(inode, "insert new index %d before: %llu\n", 1013 logical, ptr); 1014 ix = curp->p_idx; 1015 } 1016 1017 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { 1018 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); 1019 return -EFSCORRUPTED; 1020 } 1021 1022 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; 1023 BUG_ON(len < 0); 1024 if (len > 0) { 1025 ext_debug(inode, 
"insert new index %d: " 1026 "move %d indices from 0x%p to 0x%p\n", 1027 logical, len, ix, ix + 1); 1028 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); 1029 } 1030 1031 ix->ei_block = cpu_to_le32(logical); 1032 ext4_idx_store_pblock(ix, ptr); 1033 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 1034 1035 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 1036 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 1037 return -EFSCORRUPTED; 1038 } 1039 1040 err = ext4_ext_dirty(handle, inode, curp); 1041 ext4_std_error(inode->i_sb, err); 1042 1043 return err; 1044} 1045 1046/* 1047 * ext4_ext_split: 1048 * inserts new subtree into the path, using free index entry 1049 * at depth @at: 1050 * - allocates all needed blocks (new leaf and all intermediate index blocks) 1051 * - makes decision where to split 1052 * - moves remaining extents and index entries (right to the split point) 1053 * into the newly allocated blocks 1054 * - initializes subtree 1055 */ 1056static int ext4_ext_split(handle_t *handle, struct inode *inode, 1057 unsigned int flags, 1058 struct ext4_ext_path *path, 1059 struct ext4_extent *newext, int at) 1060{ 1061 struct buffer_head *bh = NULL; 1062 int depth = ext_depth(inode); 1063 struct ext4_extent_header *neh; 1064 struct ext4_extent_idx *fidx; 1065 int i = at, k, m, a; 1066 ext4_fsblk_t newblock, oldblock; 1067 __le32 border; 1068 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 1069 gfp_t gfp_flags = GFP_NOFS; 1070 int err = 0; 1071 size_t ext_size = 0; 1072 1073 if (flags & EXT4_EX_NOFAIL) 1074 gfp_flags |= __GFP_NOFAIL; 1075 1076 /* make decision: where to split? */ 1077 /* FIXME: now decision is simplest: at current extent */ 1078 1079 /* if current leaf will be split, then we should use 1080 * border from split point */ 1081 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { 1082 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); 1083 return -EFSCORRUPTED; 1084 } 1085 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 1086 border = path[depth].p_ext[1].ee_block; 1087 ext_debug(inode, "leaf will be split." 1088 " next leaf starts at %d\n", 1089 le32_to_cpu(border)); 1090 } else { 1091 border = newext->ee_block; 1092 ext_debug(inode, "leaf will be added." 1093 " next leaf starts at %d\n", 1094 le32_to_cpu(border)); 1095 } 1096 1097 /* 1098 * If error occurs, then we break processing 1099 * and mark filesystem read-only. index won't 1100 * be inserted and tree will be in consistent 1101 * state. Next mount will repair buffers too. 1102 */ 1103 1104 /* 1105 * Get array to track all allocated blocks. 1106 * We need this to handle errors and free blocks 1107 * upon them. 
1108 */ 1109 ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); 1110 if (!ablocks) 1111 return -ENOMEM; 1112 1113 /* allocate all needed blocks */ 1114 ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); 1115 for (a = 0; a < depth - at; a++) { 1116 newblock = ext4_ext_new_meta_block(handle, inode, path, 1117 newext, &err, flags); 1118 if (newblock == 0) 1119 goto cleanup; 1120 ablocks[a] = newblock; 1121 } 1122 1123 /* initialize new leaf */ 1124 newblock = ablocks[--a]; 1125 if (unlikely(newblock == 0)) { 1126 EXT4_ERROR_INODE(inode, "newblock == 0!"); 1127 err = -EFSCORRUPTED; 1128 goto cleanup; 1129 } 1130 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); 1131 if (unlikely(!bh)) { 1132 err = -ENOMEM; 1133 goto cleanup; 1134 } 1135 lock_buffer(bh); 1136 1137 err = ext4_journal_get_create_access(handle, bh); 1138 if (err) 1139 goto cleanup; 1140 1141 neh = ext_block_hdr(bh); 1142 neh->eh_entries = 0; 1143 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1144 neh->eh_magic = EXT4_EXT_MAGIC; 1145 neh->eh_depth = 0; 1146 neh->eh_generation = 0; 1147 1148 /* move remainder of path[depth] to the new leaf */ 1149 if (unlikely(path[depth].p_hdr->eh_entries != 1150 path[depth].p_hdr->eh_max)) { 1151 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", 1152 path[depth].p_hdr->eh_entries, 1153 path[depth].p_hdr->eh_max); 1154 err = -EFSCORRUPTED; 1155 goto cleanup; 1156 } 1157 /* start copy from next extent */ 1158 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; 1159 ext4_ext_show_move(inode, path, newblock, depth); 1160 if (m) { 1161 struct ext4_extent *ex; 1162 ex = EXT_FIRST_EXTENT(neh); 1163 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); 1164 le16_add_cpu(&neh->eh_entries, m); 1165 } 1166 1167 /* zero out unused area in the extent block */ 1168 ext_size = sizeof(struct ext4_extent_header) + 1169 sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); 1170 memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); 1171 ext4_extent_block_csum_set(inode, neh); 1172 set_buffer_uptodate(bh); 1173 unlock_buffer(bh); 1174 1175 err = ext4_handle_dirty_metadata(handle, inode, bh); 1176 if (err) 1177 goto cleanup; 1178 brelse(bh); 1179 bh = NULL; 1180 1181 /* correct old leaf */ 1182 if (m) { 1183 err = ext4_ext_get_access(handle, inode, path + depth); 1184 if (err) 1185 goto cleanup; 1186 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); 1187 err = ext4_ext_dirty(handle, inode, path + depth); 1188 if (err) 1189 goto cleanup; 1190 1191 } 1192 1193 /* create intermediate indexes */ 1194 k = depth - at - 1; 1195 if (unlikely(k < 0)) { 1196 EXT4_ERROR_INODE(inode, "k %d < 0!", k); 1197 err = -EFSCORRUPTED; 1198 goto cleanup; 1199 } 1200 if (k) 1201 ext_debug(inode, "create %d intermediate indices\n", k); 1202 /* insert new index into current index block */ 1203 /* current depth stored in i var */ 1204 i = depth - 1; 1205 while (k--) { 1206 oldblock = newblock; 1207 newblock = ablocks[--a]; 1208 bh = sb_getblk(inode->i_sb, newblock); 1209 if (unlikely(!bh)) { 1210 err = -ENOMEM; 1211 goto cleanup; 1212 } 1213 lock_buffer(bh); 1214 1215 err = ext4_journal_get_create_access(handle, bh); 1216 if (err) 1217 goto cleanup; 1218 1219 neh = ext_block_hdr(bh); 1220 neh->eh_entries = cpu_to_le16(1); 1221 neh->eh_magic = EXT4_EXT_MAGIC; 1222 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1223 neh->eh_depth = cpu_to_le16(depth - i); 1224 neh->eh_generation = 0; 1225 fidx = EXT_FIRST_INDEX(neh); 1226 
fidx->ei_block = border; 1227 ext4_idx_store_pblock(fidx, oldblock); 1228 1229 ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", 1230 i, newblock, le32_to_cpu(border), oldblock); 1231 1232 /* move remainder of path[i] to the new index block */ 1233 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1234 EXT_LAST_INDEX(path[i].p_hdr))) { 1235 EXT4_ERROR_INODE(inode, 1236 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", 1237 le32_to_cpu(path[i].p_ext->ee_block)); 1238 err = -EFSCORRUPTED; 1239 goto cleanup; 1240 } 1241 /* start copy indexes */ 1242 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; 1243 ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, 1244 EXT_MAX_INDEX(path[i].p_hdr)); 1245 ext4_ext_show_move(inode, path, newblock, i); 1246 if (m) { 1247 memmove(++fidx, path[i].p_idx, 1248 sizeof(struct ext4_extent_idx) * m); 1249 le16_add_cpu(&neh->eh_entries, m); 1250 } 1251 /* zero out unused area in the extent block */ 1252 ext_size = sizeof(struct ext4_extent_header) + 1253 (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); 1254 memset(bh->b_data + ext_size, 0, 1255 inode->i_sb->s_blocksize - ext_size); 1256 ext4_extent_block_csum_set(inode, neh); 1257 set_buffer_uptodate(bh); 1258 unlock_buffer(bh); 1259 1260 err = ext4_handle_dirty_metadata(handle, inode, bh); 1261 if (err) 1262 goto cleanup; 1263 brelse(bh); 1264 bh = NULL; 1265 1266 /* correct old index */ 1267 if (m) { 1268 err = ext4_ext_get_access(handle, inode, path + i); 1269 if (err) 1270 goto cleanup; 1271 le16_add_cpu(&path[i].p_hdr->eh_entries, -m); 1272 err = ext4_ext_dirty(handle, inode, path + i); 1273 if (err) 1274 goto cleanup; 1275 } 1276 1277 i--; 1278 } 1279 1280 /* insert new index */ 1281 err = ext4_ext_insert_index(handle, inode, path + at, 1282 le32_to_cpu(border), newblock); 1283 1284cleanup: 1285 if (bh) { 1286 if (buffer_locked(bh)) 1287 unlock_buffer(bh); 1288 brelse(bh); 1289 } 1290 1291 if (err) { 1292 /* free all allocated blocks in error case */ 1293 for (i = 0; i < depth; i++) { 1294 if (!ablocks[i]) 1295 continue; 1296 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, 1297 EXT4_FREE_BLOCKS_METADATA); 1298 } 1299 } 1300 kfree(ablocks); 1301 1302 return err; 1303} 1304 1305/* 1306 * ext4_ext_grow_indepth: 1307 * implements tree growing procedure: 1308 * - allocates new block 1309 * - moves top-level data (index block or leaf) into the new block 1310 * - initializes new top-level, creating index that points to the 1311 * just created block 1312 */ 1313static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1314 unsigned int flags) 1315{ 1316 struct ext4_extent_header *neh; 1317 struct buffer_head *bh; 1318 ext4_fsblk_t newblock, goal = 0; 1319 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 1320 int err = 0; 1321 size_t ext_size = 0; 1322 1323 /* Try to prepend new index to old one */ 1324 if (ext_depth(inode)) 1325 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); 1326 if (goal > le32_to_cpu(es->s_first_data_block)) { 1327 flags |= EXT4_MB_HINT_TRY_GOAL; 1328 goal--; 1329 } else 1330 goal = ext4_inode_to_goal_block(inode); 1331 newblock = ext4_new_meta_blocks(handle, inode, goal, flags, 1332 NULL, &err); 1333 if (newblock == 0) 1334 return err; 1335 1336 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); 1337 if (unlikely(!bh)) 1338 return -ENOMEM; 1339 lock_buffer(bh); 1340 1341 err = ext4_journal_get_create_access(handle, bh); 1342 if (err) { 1343 unlock_buffer(bh); 1344 goto out; 1345 } 1346 1347 ext_size = 
sizeof(EXT4_I(inode)->i_data); 1348 /* move top-level index/leaf into new block */ 1349 memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); 1350 /* zero out unused area in the extent block */ 1351 memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); 1352 1353 /* set size of new block */ 1354 neh = ext_block_hdr(bh); 1355 /* old root could have indexes or leaves 1356 * so calculate e_max right way */ 1357 if (ext_depth(inode)) 1358 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1359 else 1360 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1361 neh->eh_magic = EXT4_EXT_MAGIC; 1362 ext4_extent_block_csum_set(inode, neh); 1363 set_buffer_uptodate(bh); 1364 unlock_buffer(bh); 1365 1366 err = ext4_handle_dirty_metadata(handle, inode, bh); 1367 if (err) 1368 goto out; 1369 1370 /* Update top-level index: num,max,pointer */ 1371 neh = ext_inode_hdr(inode); 1372 neh->eh_entries = cpu_to_le16(1); 1373 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); 1374 if (neh->eh_depth == 0) { 1375 /* Root extent block becomes index block */ 1376 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); 1377 EXT_FIRST_INDEX(neh)->ei_block = 1378 EXT_FIRST_EXTENT(neh)->ee_block; 1379 } 1380 ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", 1381 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1382 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1383 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1384 1385 le16_add_cpu(&neh->eh_depth, 1); 1386 err = ext4_mark_inode_dirty(handle, inode); 1387out: 1388 brelse(bh); 1389 1390 return err; 1391} 1392 1393/* 1394 * ext4_ext_create_new_leaf: 1395 * finds empty index and adds new leaf. 1396 * if no free index is found, then it requests in-depth growing. 1397 */ 1398static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1399 unsigned int mb_flags, 1400 unsigned int gb_flags, 1401 struct ext4_ext_path **ppath, 1402 struct ext4_extent *newext) 1403{ 1404 struct ext4_ext_path *path = *ppath; 1405 struct ext4_ext_path *curp; 1406 int depth, i, err = 0; 1407 1408repeat: 1409 i = depth = ext_depth(inode); 1410 1411 /* walk up to the tree and look for free index entry */ 1412 curp = path + depth; 1413 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { 1414 i--; 1415 curp--; 1416 } 1417 1418 /* we use already allocated block for index block, 1419 * so subsequent data blocks should be contiguous */ 1420 if (EXT_HAS_FREE_INDEX(curp)) { 1421 /* if we found index with free entry, then use that 1422 * entry: create all needed subtree and add new leaf */ 1423 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); 1424 if (err) 1425 goto out; 1426 1427 /* refill path */ 1428 path = ext4_find_extent(inode, 1429 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1430 ppath, gb_flags); 1431 if (IS_ERR(path)) 1432 err = PTR_ERR(path); 1433 } else { 1434 /* tree is full, time to grow in depth */ 1435 err = ext4_ext_grow_indepth(handle, inode, mb_flags); 1436 if (err) 1437 goto out; 1438 1439 /* refill path */ 1440 path = ext4_find_extent(inode, 1441 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1442 ppath, gb_flags); 1443 if (IS_ERR(path)) { 1444 err = PTR_ERR(path); 1445 goto out; 1446 } 1447 1448 /* 1449 * only first (depth 0 -> 1) produces free space; 1450 * in all other cases we have to split the grown tree 1451 */ 1452 depth = ext_depth(inode); 1453 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { 1454 /* now we need to split */ 1455 goto repeat; 1456 } 1457 } 1458 1459out: 1460 return err; 
}

/*
 * Search the closest allocated block to the left of *logical
 * and return it at @logical together with its physical address at @phys.
 * If *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * The return value contains 0 (success) or an error code.
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EFSCORRUPTED;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can happen that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EFSCORRUPTED;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
				  depth);
				return -EFSCORRUPTED;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EFSCORRUPTED;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * Search the closest allocated block to the right of *logical
 * and return it at @logical together with its physical address at @phys.
 * If none exists, return 0 and set @phys to 0. If an allocated block is
 * found, return 1 and fill in *ret_ex.
 * Otherwise return a (< 0) error code.
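 *
 * For example (purely illustrative numbers): if the extent in the path
 * ends at logical block 99 and the next allocated extent in the tree
 * starts at logical block 150, then on return *logical is 150, *phys is
 * that extent's first physical block, and the function returns 1.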
1534 */ 1535static int ext4_ext_search_right(struct inode *inode, 1536 struct ext4_ext_path *path, 1537 ext4_lblk_t *logical, ext4_fsblk_t *phys, 1538 struct ext4_extent *ret_ex) 1539{ 1540 struct buffer_head *bh = NULL; 1541 struct ext4_extent_header *eh; 1542 struct ext4_extent_idx *ix; 1543 struct ext4_extent *ex; 1544 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1545 int ee_len; 1546 1547 if (unlikely(path == NULL)) { 1548 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1549 return -EFSCORRUPTED; 1550 } 1551 depth = path->p_depth; 1552 *phys = 0; 1553 1554 if (depth == 0 && path->p_ext == NULL) 1555 return 0; 1556 1557 /* usually extent in the path covers blocks smaller 1558 * then *logical, but it can be that extent is the 1559 * first one in the file */ 1560 1561 ex = path[depth].p_ext; 1562 ee_len = ext4_ext_get_actual_len(ex); 1563 if (*logical < le32_to_cpu(ex->ee_block)) { 1564 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1565 EXT4_ERROR_INODE(inode, 1566 "first_extent(path[%d].p_hdr) != ex", 1567 depth); 1568 return -EFSCORRUPTED; 1569 } 1570 while (--depth >= 0) { 1571 ix = path[depth].p_idx; 1572 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1573 EXT4_ERROR_INODE(inode, 1574 "ix != EXT_FIRST_INDEX *logical %d!", 1575 *logical); 1576 return -EFSCORRUPTED; 1577 } 1578 } 1579 goto found_extent; 1580 } 1581 1582 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1583 EXT4_ERROR_INODE(inode, 1584 "logical %d < ee_block %d + ee_len %d!", 1585 *logical, le32_to_cpu(ex->ee_block), ee_len); 1586 return -EFSCORRUPTED; 1587 } 1588 1589 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1590 /* next allocated block in this leaf */ 1591 ex++; 1592 goto found_extent; 1593 } 1594 1595 /* go up and search for index to the right */ 1596 while (--depth >= 0) { 1597 ix = path[depth].p_idx; 1598 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1599 goto got_index; 1600 } 1601 1602 /* we've gone up to the root and found no index to the right */ 1603 return 0; 1604 1605got_index: 1606 /* we've found index to the right, let's 1607 * follow it and find the closest allocated 1608 * block to the right */ 1609 ix++; 1610 while (++depth < path->p_depth) { 1611 /* subtract from p_depth to get proper eh_depth */ 1612 bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); 1613 if (IS_ERR(bh)) 1614 return PTR_ERR(bh); 1615 eh = ext_block_hdr(bh); 1616 ix = EXT_FIRST_INDEX(eh); 1617 put_bh(bh); 1618 } 1619 1620 bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); 1621 if (IS_ERR(bh)) 1622 return PTR_ERR(bh); 1623 eh = ext_block_hdr(bh); 1624 ex = EXT_FIRST_EXTENT(eh); 1625found_extent: 1626 *logical = le32_to_cpu(ex->ee_block); 1627 *phys = ext4_ext_pblock(ex); 1628 if (ret_ex) 1629 *ret_ex = *ex; 1630 if (bh) 1631 put_bh(bh); 1632 return 1; 1633} 1634 1635/* 1636 * ext4_ext_next_allocated_block: 1637 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. 1638 * NOTE: it considers block number from index entry as 1639 * allocated block. Thus, index entries have to be consistent 1640 * with leaves. 
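 *
 * E.g. (illustrative numbers): if the path ends in a leaf whose last
 * extent covers blocks 100..107 and the parent index has a following
 * entry starting at block 200, the function returns 200 without
 * checking that block against the child leaf it belongs to.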
1641 */ 1642ext4_lblk_t 1643ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1644{ 1645 int depth; 1646 1647 BUG_ON(path == NULL); 1648 depth = path->p_depth; 1649 1650 if (depth == 0 && path->p_ext == NULL) 1651 return EXT_MAX_BLOCKS; 1652 1653 while (depth >= 0) { 1654 struct ext4_ext_path *p = &path[depth]; 1655 1656 if (depth == path->p_depth) { 1657 /* leaf */ 1658 if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) 1659 return le32_to_cpu(p->p_ext[1].ee_block); 1660 } else { 1661 /* index */ 1662 if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) 1663 return le32_to_cpu(p->p_idx[1].ei_block); 1664 } 1665 depth--; 1666 } 1667 1668 return EXT_MAX_BLOCKS; 1669} 1670 1671/* 1672 * ext4_ext_next_leaf_block: 1673 * returns first allocated block from next leaf or EXT_MAX_BLOCKS 1674 */ 1675static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) 1676{ 1677 int depth; 1678 1679 BUG_ON(path == NULL); 1680 depth = path->p_depth; 1681 1682 /* zero-tree has no leaf blocks at all */ 1683 if (depth == 0) 1684 return EXT_MAX_BLOCKS; 1685 1686 /* go to index block */ 1687 depth--; 1688 1689 while (depth >= 0) { 1690 if (path[depth].p_idx != 1691 EXT_LAST_INDEX(path[depth].p_hdr)) 1692 return (ext4_lblk_t) 1693 le32_to_cpu(path[depth].p_idx[1].ei_block); 1694 depth--; 1695 } 1696 1697 return EXT_MAX_BLOCKS; 1698} 1699 1700/* 1701 * ext4_ext_correct_indexes: 1702 * if leaf gets modified and modified extent is first in the leaf, 1703 * then we have to correct all indexes above. 1704 * TODO: do we need to correct tree in all cases? 1705 */ 1706static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1707 struct ext4_ext_path *path) 1708{ 1709 struct ext4_extent_header *eh; 1710 int depth = ext_depth(inode); 1711 struct ext4_extent *ex; 1712 __le32 border; 1713 int k, err = 0; 1714 1715 eh = path[depth].p_hdr; 1716 ex = path[depth].p_ext; 1717 1718 if (unlikely(ex == NULL || eh == NULL)) { 1719 EXT4_ERROR_INODE(inode, 1720 "ex %p == NULL or eh %p == NULL", ex, eh); 1721 return -EFSCORRUPTED; 1722 } 1723 1724 if (depth == 0) { 1725 /* there is no tree at all */ 1726 return 0; 1727 } 1728 1729 if (ex != EXT_FIRST_EXTENT(eh)) { 1730 /* we correct tree if first leaf got modified only */ 1731 return 0; 1732 } 1733 1734 /* 1735 * TODO: we need correction if border is smaller than current one 1736 */ 1737 k = depth - 1; 1738 border = path[depth].p_ext->ee_block; 1739 err = ext4_ext_get_access(handle, inode, path + k); 1740 if (err) 1741 return err; 1742 path[k].p_idx->ei_block = border; 1743 err = ext4_ext_dirty(handle, inode, path + k); 1744 if (err) 1745 return err; 1746 1747 while (k--) { 1748 /* change all left-side indexes */ 1749 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) 1750 break; 1751 err = ext4_ext_get_access(handle, inode, path + k); 1752 if (err) 1753 break; 1754 path[k].p_idx->ei_block = border; 1755 err = ext4_ext_dirty(handle, inode, path + k); 1756 if (err) 1757 break; 1758 } 1759 1760 return err; 1761} 1762 1763static int ext4_can_extents_be_merged(struct inode *inode, 1764 struct ext4_extent *ex1, 1765 struct ext4_extent *ex2) 1766{ 1767 unsigned short ext1_ee_len, ext2_ee_len; 1768 1769 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) 1770 return 0; 1771 1772 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1773 ext2_ee_len = ext4_ext_get_actual_len(ex2); 1774 1775 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != 1776 le32_to_cpu(ex2->ee_block)) 1777 return 0; 1778 1779 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1780 return 0; 
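	/*
	 * Unwritten extents use the high bit of ee_len as a flag, so their
	 * maximum merged length is one block shorter than for written
	 * extents.
	 */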
1781 1782 if (ext4_ext_is_unwritten(ex1) && 1783 ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) 1784 return 0; 1785#ifdef AGGRESSIVE_TEST 1786 if (ext1_ee_len >= 4) 1787 return 0; 1788#endif 1789 1790 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) 1791 return 1; 1792 return 0; 1793} 1794 1795/* 1796 * This function tries to merge the "ex" extent to the next extent in the tree. 1797 * It always tries to merge towards right. If you want to merge towards 1798 * left, pass "ex - 1" as argument instead of "ex". 1799 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1800 * 1 if they got merged. 1801 */ 1802static int ext4_ext_try_to_merge_right(struct inode *inode, 1803 struct ext4_ext_path *path, 1804 struct ext4_extent *ex) 1805{ 1806 struct ext4_extent_header *eh; 1807 unsigned int depth, len; 1808 int merge_done = 0, unwritten; 1809 1810 depth = ext_depth(inode); 1811 BUG_ON(path[depth].p_hdr == NULL); 1812 eh = path[depth].p_hdr; 1813 1814 while (ex < EXT_LAST_EXTENT(eh)) { 1815 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1816 break; 1817 /* merge with next extent! */ 1818 unwritten = ext4_ext_is_unwritten(ex); 1819 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1820 + ext4_ext_get_actual_len(ex + 1)); 1821 if (unwritten) 1822 ext4_ext_mark_unwritten(ex); 1823 1824 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1825 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1826 * sizeof(struct ext4_extent); 1827 memmove(ex + 1, ex + 2, len); 1828 } 1829 le16_add_cpu(&eh->eh_entries, -1); 1830 merge_done = 1; 1831 WARN_ON(eh->eh_entries == 0); 1832 if (!eh->eh_entries) 1833 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); 1834 } 1835 1836 return merge_done; 1837} 1838 1839/* 1840 * This function does a very simple check to see if we can collapse 1841 * an extent tree with a single extent tree leaf block into the inode. 1842 */ 1843static void ext4_ext_try_to_merge_up(handle_t *handle, 1844 struct inode *inode, 1845 struct ext4_ext_path *path) 1846{ 1847 size_t s; 1848 unsigned max_root = ext4_ext_space_root(inode, 0); 1849 ext4_fsblk_t blk; 1850 1851 if ((path[0].p_depth != 1) || 1852 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || 1853 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) 1854 return; 1855 1856 /* 1857 * We need to modify the block allocation bitmap and the block 1858 * group descriptor to release the extent tree block. If we 1859 * can't get the journal credits, give up. 1860 */ 1861 if (ext4_journal_extend(handle, 2, 1862 ext4_free_metadata_revoke_credits(inode->i_sb, 1))) 1863 return; 1864 1865 /* 1866 * Copy the extent data up to the inode 1867 */ 1868 blk = ext4_idx_pblock(path[0].p_idx); 1869 s = le16_to_cpu(path[1].p_hdr->eh_entries) * 1870 sizeof(struct ext4_extent_idx); 1871 s += sizeof(struct ext4_extent_header); 1872 1873 path[1].p_maxdepth = path[0].p_maxdepth; 1874 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1875 path[0].p_depth = 0; 1876 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + 1877 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); 1878 path[0].p_hdr->eh_max = cpu_to_le16(max_root); 1879 1880 brelse(path[1].p_bh); 1881 ext4_free_blocks(handle, inode, NULL, blk, 1, 1882 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1883} 1884 1885/* 1886 * This function tries to merge the @ex extent to neighbours in the tree, then 1887 * tries to collapse the extent tree into the inode. 
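 * The left neighbour (@ex - 1) is tried first; if that merge succeeds,
 * @ex has already been absorbed and the right-hand merge is skipped.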
1888 */ 1889static void ext4_ext_try_to_merge(handle_t *handle, 1890 struct inode *inode, 1891 struct ext4_ext_path *path, 1892 struct ext4_extent *ex) 1893{ 1894 struct ext4_extent_header *eh; 1895 unsigned int depth; 1896 int merge_done = 0; 1897 1898 depth = ext_depth(inode); 1899 BUG_ON(path[depth].p_hdr == NULL); 1900 eh = path[depth].p_hdr; 1901 1902 if (ex > EXT_FIRST_EXTENT(eh)) 1903 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1904 1905 if (!merge_done) 1906 (void) ext4_ext_try_to_merge_right(inode, path, ex); 1907 1908 ext4_ext_try_to_merge_up(handle, inode, path); 1909} 1910 1911/* 1912 * check if a portion of the "newext" extent overlaps with an 1913 * existing extent. 1914 * 1915 * If there is an overlap discovered, it updates the length of the newext 1916 * such that there will be no overlap, and then returns 1. 1917 * If there is no overlap found, it returns 0. 1918 */ 1919static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, 1920 struct inode *inode, 1921 struct ext4_extent *newext, 1922 struct ext4_ext_path *path) 1923{ 1924 ext4_lblk_t b1, b2; 1925 unsigned int depth, len1; 1926 unsigned int ret = 0; 1927 1928 b1 = le32_to_cpu(newext->ee_block); 1929 len1 = ext4_ext_get_actual_len(newext); 1930 depth = ext_depth(inode); 1931 if (!path[depth].p_ext) 1932 goto out; 1933 b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); 1934 1935 /* 1936 * get the next allocated block if the extent in the path 1937 * is before the requested block(s) 1938 */ 1939 if (b2 < b1) { 1940 b2 = ext4_ext_next_allocated_block(path); 1941 if (b2 == EXT_MAX_BLOCKS) 1942 goto out; 1943 b2 = EXT4_LBLK_CMASK(sbi, b2); 1944 } 1945 1946 /* check for wrap through zero on extent logical start block*/ 1947 if (b1 + len1 < b1) { 1948 len1 = EXT_MAX_BLOCKS - b1; 1949 newext->ee_len = cpu_to_le16(len1); 1950 ret = 1; 1951 } 1952 1953 /* check for overlap */ 1954 if (b1 + len1 > b2) { 1955 newext->ee_len = cpu_to_le16(b2 - b1); 1956 ret = 1; 1957 } 1958out: 1959 return ret; 1960} 1961 1962/* 1963 * ext4_ext_insert_extent: 1964 * tries to merge requested extent into the existing extent or 1965 * inserts requested extent as new one into the tree, 1966 * creating new leaf in the no-space case. 1967 */ 1968int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1969 struct ext4_ext_path **ppath, 1970 struct ext4_extent *newext, int gb_flags) 1971{ 1972 struct ext4_ext_path *path = *ppath; 1973 struct ext4_extent_header *eh; 1974 struct ext4_extent *ex, *fex; 1975 struct ext4_extent *nearex; /* nearest extent */ 1976 struct ext4_ext_path *npath = NULL; 1977 int depth, len, err; 1978 ext4_lblk_t next; 1979 int mb_flags = 0, unwritten; 1980 1981 if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1982 mb_flags |= EXT4_MB_DELALLOC_RESERVED; 1983 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1984 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1985 return -EFSCORRUPTED; 1986 } 1987 depth = ext_depth(inode); 1988 ex = path[depth].p_ext; 1989 eh = path[depth].p_hdr; 1990 if (unlikely(path[depth].p_hdr == NULL)) { 1991 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 1992 return -EFSCORRUPTED; 1993 } 1994 1995 /* try to insert block into found extent and return */ 1996 if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { 1997 1998 /* 1999 * Try to see whether we should rather test the extent on 2000 * right from ex, or from the left of ex. 
This is because 2001 * ext4_find_extent() can return either extent on the 2002 * left, or on the right from the searched position. This 2003 * will make merging more effective. 2004 */ 2005 if (ex < EXT_LAST_EXTENT(eh) && 2006 (le32_to_cpu(ex->ee_block) + 2007 ext4_ext_get_actual_len(ex) < 2008 le32_to_cpu(newext->ee_block))) { 2009 ex += 1; 2010 goto prepend; 2011 } else if ((ex > EXT_FIRST_EXTENT(eh)) && 2012 (le32_to_cpu(newext->ee_block) + 2013 ext4_ext_get_actual_len(newext) < 2014 le32_to_cpu(ex->ee_block))) 2015 ex -= 1; 2016 2017 /* Try to append newex to the ex */ 2018 if (ext4_can_extents_be_merged(inode, ex, newext)) { 2019 ext_debug(inode, "append [%d]%d block to %u:[%d]%d" 2020 "(from %llu)\n", 2021 ext4_ext_is_unwritten(newext), 2022 ext4_ext_get_actual_len(newext), 2023 le32_to_cpu(ex->ee_block), 2024 ext4_ext_is_unwritten(ex), 2025 ext4_ext_get_actual_len(ex), 2026 ext4_ext_pblock(ex)); 2027 err = ext4_ext_get_access(handle, inode, 2028 path + depth); 2029 if (err) 2030 return err; 2031 unwritten = ext4_ext_is_unwritten(ex); 2032 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 2033 + ext4_ext_get_actual_len(newext)); 2034 if (unwritten) 2035 ext4_ext_mark_unwritten(ex); 2036 eh = path[depth].p_hdr; 2037 nearex = ex; 2038 goto merge; 2039 } 2040 2041prepend: 2042 /* Try to prepend newex to the ex */ 2043 if (ext4_can_extents_be_merged(inode, newext, ex)) { 2044 ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" 2045 "(from %llu)\n", 2046 le32_to_cpu(newext->ee_block), 2047 ext4_ext_is_unwritten(newext), 2048 ext4_ext_get_actual_len(newext), 2049 le32_to_cpu(ex->ee_block), 2050 ext4_ext_is_unwritten(ex), 2051 ext4_ext_get_actual_len(ex), 2052 ext4_ext_pblock(ex)); 2053 err = ext4_ext_get_access(handle, inode, 2054 path + depth); 2055 if (err) 2056 return err; 2057 2058 unwritten = ext4_ext_is_unwritten(ex); 2059 ex->ee_block = newext->ee_block; 2060 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 2061 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 2062 + ext4_ext_get_actual_len(newext)); 2063 if (unwritten) 2064 ext4_ext_mark_unwritten(ex); 2065 eh = path[depth].p_hdr; 2066 nearex = ex; 2067 goto merge; 2068 } 2069 } 2070 2071 depth = ext_depth(inode); 2072 eh = path[depth].p_hdr; 2073 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 2074 goto has_space; 2075 2076 /* probably next leaf has space for us? */ 2077 fex = EXT_LAST_EXTENT(eh); 2078 next = EXT_MAX_BLOCKS; 2079 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 2080 next = ext4_ext_next_leaf_block(path); 2081 if (next != EXT_MAX_BLOCKS) { 2082 ext_debug(inode, "next leaf block - %u\n", next); 2083 BUG_ON(npath != NULL); 2084 npath = ext4_find_extent(inode, next, NULL, gb_flags); 2085 if (IS_ERR(npath)) 2086 return PTR_ERR(npath); 2087 BUG_ON(npath->p_depth != path->p_depth); 2088 eh = npath[depth].p_hdr; 2089 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 2090 ext_debug(inode, "next leaf isn't full(%d)\n", 2091 le16_to_cpu(eh->eh_entries)); 2092 path = npath; 2093 goto has_space; 2094 } 2095 ext_debug(inode, "next leaf has no free space(%d,%d)\n", 2096 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 2097 } 2098 2099 /* 2100 * There is no free space in the found leaf. 2101 * We're gonna add a new leaf in the tree. 
2102 */ 2103 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2104 mb_flags |= EXT4_MB_USE_RESERVED; 2105 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 2106 ppath, newext); 2107 if (err) 2108 goto cleanup; 2109 depth = ext_depth(inode); 2110 eh = path[depth].p_hdr; 2111 2112has_space: 2113 nearex = path[depth].p_ext; 2114 2115 err = ext4_ext_get_access(handle, inode, path + depth); 2116 if (err) 2117 goto cleanup; 2118 2119 if (!nearex) { 2120 /* there is no extent in this leaf, create first one */ 2121 ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", 2122 le32_to_cpu(newext->ee_block), 2123 ext4_ext_pblock(newext), 2124 ext4_ext_is_unwritten(newext), 2125 ext4_ext_get_actual_len(newext)); 2126 nearex = EXT_FIRST_EXTENT(eh); 2127 } else { 2128 if (le32_to_cpu(newext->ee_block) 2129 > le32_to_cpu(nearex->ee_block)) { 2130 /* Insert after */ 2131 ext_debug(inode, "insert %u:%llu:[%d]%d before: " 2132 "nearest %p\n", 2133 le32_to_cpu(newext->ee_block), 2134 ext4_ext_pblock(newext), 2135 ext4_ext_is_unwritten(newext), 2136 ext4_ext_get_actual_len(newext), 2137 nearex); 2138 nearex++; 2139 } else { 2140 /* Insert before */ 2141 BUG_ON(newext->ee_block == nearex->ee_block); 2142 ext_debug(inode, "insert %u:%llu:[%d]%d after: " 2143 "nearest %p\n", 2144 le32_to_cpu(newext->ee_block), 2145 ext4_ext_pblock(newext), 2146 ext4_ext_is_unwritten(newext), 2147 ext4_ext_get_actual_len(newext), 2148 nearex); 2149 } 2150 len = EXT_LAST_EXTENT(eh) - nearex + 1; 2151 if (len > 0) { 2152 ext_debug(inode, "insert %u:%llu:[%d]%d: " 2153 "move %d extents from 0x%p to 0x%p\n", 2154 le32_to_cpu(newext->ee_block), 2155 ext4_ext_pblock(newext), 2156 ext4_ext_is_unwritten(newext), 2157 ext4_ext_get_actual_len(newext), 2158 len, nearex, nearex + 1); 2159 memmove(nearex + 1, nearex, 2160 len * sizeof(struct ext4_extent)); 2161 } 2162 } 2163 2164 le16_add_cpu(&eh->eh_entries, 1); 2165 path[depth].p_ext = nearex; 2166 nearex->ee_block = newext->ee_block; 2167 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 2168 nearex->ee_len = newext->ee_len; 2169 2170merge: 2171 /* try to merge extents */ 2172 if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) 2173 ext4_ext_try_to_merge(handle, inode, path, nearex); 2174 2175 2176 /* time to correct all indexes above */ 2177 err = ext4_ext_correct_indexes(handle, inode, path); 2178 if (err) 2179 goto cleanup; 2180 2181 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2182 2183cleanup: 2184 ext4_ext_drop_refs(npath); 2185 kfree(npath); 2186 return err; 2187} 2188 2189static int ext4_fill_es_cache_info(struct inode *inode, 2190 ext4_lblk_t block, ext4_lblk_t num, 2191 struct fiemap_extent_info *fieinfo) 2192{ 2193 ext4_lblk_t next, end = block + num - 1; 2194 struct extent_status es; 2195 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 2196 unsigned int flags; 2197 int err; 2198 2199 while (block <= end) { 2200 next = 0; 2201 flags = 0; 2202 if (!ext4_es_lookup_extent(inode, block, &next, &es)) 2203 break; 2204 if (ext4_es_is_unwritten(&es)) 2205 flags |= FIEMAP_EXTENT_UNWRITTEN; 2206 if (ext4_es_is_delayed(&es)) 2207 flags |= (FIEMAP_EXTENT_DELALLOC | 2208 FIEMAP_EXTENT_UNKNOWN); 2209 if (ext4_es_is_hole(&es)) 2210 flags |= EXT4_FIEMAP_EXTENT_HOLE; 2211 if (next == 0) 2212 flags |= FIEMAP_EXTENT_LAST; 2213 if (flags & (FIEMAP_EXTENT_DELALLOC| 2214 EXT4_FIEMAP_EXTENT_HOLE)) 2215 es.es_pblk = 0; 2216 else 2217 es.es_pblk = ext4_es_pblock(&es); 2218 err = fiemap_fill_next_extent(fieinfo, 2219 (__u64)es.es_lblk << blksize_bits, 2220 
(__u64)es.es_pblk << blksize_bits, 2221 (__u64)es.es_len << blksize_bits, 2222 flags); 2223 if (next == 0) 2224 break; 2225 block = next; 2226 if (err < 0) 2227 return err; 2228 if (err == 1) 2229 return 0; 2230 } 2231 return 0; 2232} 2233 2234 2235/* 2236 * ext4_ext_determine_hole - determine hole around given block 2237 * @inode: inode we lookup in 2238 * @path: path in extent tree to @lblk 2239 * @lblk: pointer to logical block around which we want to determine hole 2240 * 2241 * Determine hole length (and start if easily possible) around given logical 2242 * block. We don't try too hard to find the beginning of the hole but @path 2243 * actually points to extent before @lblk, we provide it. 2244 * 2245 * The function returns the length of a hole starting at @lblk. We update @lblk 2246 * to the beginning of the hole if we managed to find it. 2247 */ 2248static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, 2249 struct ext4_ext_path *path, 2250 ext4_lblk_t *lblk) 2251{ 2252 int depth = ext_depth(inode); 2253 struct ext4_extent *ex; 2254 ext4_lblk_t len; 2255 2256 ex = path[depth].p_ext; 2257 if (ex == NULL) { 2258 /* there is no extent yet, so gap is [0;-] */ 2259 *lblk = 0; 2260 len = EXT_MAX_BLOCKS; 2261 } else if (*lblk < le32_to_cpu(ex->ee_block)) { 2262 len = le32_to_cpu(ex->ee_block) - *lblk; 2263 } else if (*lblk >= le32_to_cpu(ex->ee_block) 2264 + ext4_ext_get_actual_len(ex)) { 2265 ext4_lblk_t next; 2266 2267 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); 2268 next = ext4_ext_next_allocated_block(path); 2269 BUG_ON(next == *lblk); 2270 len = next - *lblk; 2271 } else { 2272 BUG(); 2273 } 2274 return len; 2275} 2276 2277/* 2278 * ext4_ext_put_gap_in_cache: 2279 * calculate boundaries of the gap that the requested block fits into 2280 * and cache this gap 2281 */ 2282static void 2283ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, 2284 ext4_lblk_t hole_len) 2285{ 2286 struct extent_status es; 2287 2288 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, 2289 hole_start + hole_len - 1, &es); 2290 if (es.es_len) { 2291 /* There's delayed extent containing lblock? */ 2292 if (es.es_lblk <= hole_start) 2293 return; 2294 hole_len = min(es.es_lblk - hole_start, hole_len); 2295 } 2296 ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); 2297 ext4_es_insert_extent(inode, hole_start, hole_len, ~0, 2298 EXTENT_STATUS_HOLE); 2299} 2300 2301/* 2302 * ext4_ext_rm_idx: 2303 * removes index from the index block. 
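 *
 * Illustrative sketch (added for clarity, not from the original comment):
 * if a depth-2 tree drops its last leaf under some index block, the entry
 * for that leaf is removed here, the index block itself is freed with
 * ext4_free_blocks(), and, when the removed entry was the leftmost one,
 * the new leftmost ei_block value is propagated up into the parent index
 * entries so later lookups stay consistent.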
2304 */ 2305static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2306 struct ext4_ext_path *path, int depth) 2307{ 2308 int err; 2309 ext4_fsblk_t leaf; 2310 2311 /* free index block */ 2312 depth--; 2313 path = path + depth; 2314 leaf = ext4_idx_pblock(path->p_idx); 2315 if (unlikely(path->p_hdr->eh_entries == 0)) { 2316 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2317 return -EFSCORRUPTED; 2318 } 2319 err = ext4_ext_get_access(handle, inode, path); 2320 if (err) 2321 return err; 2322 2323 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { 2324 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; 2325 len *= sizeof(struct ext4_extent_idx); 2326 memmove(path->p_idx, path->p_idx + 1, len); 2327 } 2328 2329 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2330 err = ext4_ext_dirty(handle, inode, path); 2331 if (err) 2332 return err; 2333 ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); 2334 trace_ext4_ext_rm_idx(inode, leaf); 2335 2336 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2337 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2338 2339 while (--depth >= 0) { 2340 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) 2341 break; 2342 path--; 2343 err = ext4_ext_get_access(handle, inode, path); 2344 if (err) 2345 break; 2346 path->p_idx->ei_block = (path+1)->p_idx->ei_block; 2347 err = ext4_ext_dirty(handle, inode, path); 2348 if (err) 2349 break; 2350 } 2351 return err; 2352} 2353 2354/* 2355 * ext4_ext_calc_credits_for_single_extent: 2356 * This routine returns max. credits that needed to insert an extent 2357 * to the extent tree. 2358 * When pass the actual path, the caller should calculate credits 2359 * under i_data_sem. 2360 */ 2361int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2362 struct ext4_ext_path *path) 2363{ 2364 if (path) { 2365 int depth = ext_depth(inode); 2366 int ret = 0; 2367 2368 /* probably there is space in leaf? */ 2369 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2370 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2371 2372 /* 2373 * There are some space in the leaf tree, no 2374 * need to account for leaf block credit 2375 * 2376 * bitmaps and block group descriptor blocks 2377 * and other metadata blocks still need to be 2378 * accounted. 2379 */ 2380 /* 1 bitmap, 1 block group descriptor */ 2381 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2382 return ret; 2383 } 2384 } 2385 2386 return ext4_chunk_trans_blocks(inode, nrblocks); 2387} 2388 2389/* 2390 * How many index/leaf blocks need to change/allocate to add @extents extents? 2391 * 2392 * If we add a single extent, then in the worse case, each tree level 2393 * index/leaf need to be changed in case of the tree split. 2394 * 2395 * If more extents are inserted, they could cause the whole tree split more 2396 * than once, but this is really rare. 2397 */ 2398int ext4_ext_index_trans_blocks(struct inode *inode, int extents) 2399{ 2400 int index; 2401 int depth; 2402 2403 /* If we are converting the inline data, only one is needed here. 
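 *
 * Worked example (illustrative, not from the original comment): for a
 * non-inline file with a tree of depth 2, the estimate below allows two
 * blocks per tree level for a single inserted extent (depth * 2 = 4) and
 * three blocks per level (depth * 3 = 6) when more than one extent is
 * inserted, to cover an additional split per level.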
*/ 2404 if (ext4_has_inline_data(inode)) 2405 return 1; 2406 2407 depth = ext_depth(inode); 2408 2409 if (extents <= 1) 2410 index = depth * 2; 2411 else 2412 index = depth * 3; 2413 2414 return index; 2415} 2416 2417static inline int get_default_free_blocks_flags(struct inode *inode) 2418{ 2419 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || 2420 ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) 2421 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 2422 else if (ext4_should_journal_data(inode)) 2423 return EXT4_FREE_BLOCKS_FORGET; 2424 return 0; 2425} 2426 2427/* 2428 * ext4_rereserve_cluster - increment the reserved cluster count when 2429 * freeing a cluster with a pending reservation 2430 * 2431 * @inode - file containing the cluster 2432 * @lblk - logical block in cluster to be reserved 2433 * 2434 * Increments the reserved cluster count and adjusts quota in a bigalloc 2435 * file system when freeing a partial cluster containing at least one 2436 * delayed and unwritten block. A partial cluster meeting that 2437 * requirement will have a pending reservation. If so, the 2438 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to 2439 * defer reserved and allocated space accounting to a subsequent call 2440 * to this function. 2441 */ 2442static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) 2443{ 2444 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2445 struct ext4_inode_info *ei = EXT4_I(inode); 2446 2447 dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); 2448 2449 spin_lock(&ei->i_block_reservation_lock); 2450 ei->i_reserved_data_blocks++; 2451 percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); 2452 spin_unlock(&ei->i_block_reservation_lock); 2453 2454 percpu_counter_add(&sbi->s_freeclusters_counter, 1); 2455 ext4_remove_pending(inode, lblk); 2456} 2457 2458static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2459 struct ext4_extent *ex, 2460 struct partial_cluster *partial, 2461 ext4_lblk_t from, ext4_lblk_t to) 2462{ 2463 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2464 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2465 ext4_fsblk_t last_pblk, pblk; 2466 ext4_lblk_t num; 2467 int flags; 2468 2469 /* only extent tail removal is allowed */ 2470 if (from < le32_to_cpu(ex->ee_block) || 2471 to != le32_to_cpu(ex->ee_block) + ee_len - 1) { 2472 ext4_error(sbi->s_sb, 2473 "strange request: removal(2) %u-%u from %u:%u", 2474 from, to, le32_to_cpu(ex->ee_block), ee_len); 2475 return 0; 2476 } 2477 2478#ifdef EXTENTS_STATS 2479 spin_lock(&sbi->s_ext_stats_lock); 2480 sbi->s_ext_blocks += ee_len; 2481 sbi->s_ext_extents++; 2482 if (ee_len < sbi->s_ext_min) 2483 sbi->s_ext_min = ee_len; 2484 if (ee_len > sbi->s_ext_max) 2485 sbi->s_ext_max = ee_len; 2486 if (ext_depth(inode) > sbi->s_depth_max) 2487 sbi->s_depth_max = ext_depth(inode); 2488 spin_unlock(&sbi->s_ext_stats_lock); 2489#endif 2490 2491 trace_ext4_remove_blocks(inode, ex, from, to, partial); 2492 2493 /* 2494 * if we have a partial cluster, and it's different from the 2495 * cluster of the last block in the extent, we free it 2496 */ 2497 last_pblk = ext4_ext_pblock(ex) + ee_len - 1; 2498 2499 if (partial->state != initial && 2500 partial->pclu != EXT4_B2C(sbi, last_pblk)) { 2501 if (partial->state == tofree) { 2502 flags = get_default_free_blocks_flags(inode); 2503 if (ext4_is_pending(inode, partial->lblk)) 2504 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; 2505 ext4_free_blocks(handle, inode, NULL, 2506 EXT4_C2B(sbi, partial->pclu), 2507 
sbi->s_cluster_ratio, flags); 2508 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) 2509 ext4_rereserve_cluster(inode, partial->lblk); 2510 } 2511 partial->state = initial; 2512 } 2513 2514 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2515 pblk = ext4_ext_pblock(ex) + ee_len - num; 2516 2517 /* 2518 * We free the partial cluster at the end of the extent (if any), 2519 * unless the cluster is used by another extent (partial_cluster 2520 * state is nofree). If a partial cluster exists here, it must be 2521 * shared with the last block in the extent. 2522 */ 2523 flags = get_default_free_blocks_flags(inode); 2524 2525 /* partial, left end cluster aligned, right end unaligned */ 2526 if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && 2527 (EXT4_LBLK_CMASK(sbi, to) >= from) && 2528 (partial->state != nofree)) { 2529 if (ext4_is_pending(inode, to)) 2530 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; 2531 ext4_free_blocks(handle, inode, NULL, 2532 EXT4_PBLK_CMASK(sbi, last_pblk), 2533 sbi->s_cluster_ratio, flags); 2534 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) 2535 ext4_rereserve_cluster(inode, to); 2536 partial->state = initial; 2537 flags = get_default_free_blocks_flags(inode); 2538 } 2539 2540 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2541 2542 /* 2543 * For bigalloc file systems, we never free a partial cluster 2544 * at the beginning of the extent. Instead, we check to see if we 2545 * need to free it on a subsequent call to ext4_remove_blocks, 2546 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. 2547 */ 2548 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2549 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2550 2551 /* reset the partial cluster if we've freed past it */ 2552 if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) 2553 partial->state = initial; 2554 2555 /* 2556 * If we've freed the entire extent but the beginning is not left 2557 * cluster aligned and is not marked as ineligible for freeing we 2558 * record the partial cluster at the beginning of the extent. It 2559 * wasn't freed by the preceding ext4_free_blocks() call, and we 2560 * need to look farther to the left to determine if it's to be freed 2561 * (not shared with another extent). Else, reset the partial 2562 * cluster - we're either done freeing or the beginning of the 2563 * extent is left cluster aligned. 2564 */ 2565 if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { 2566 if (partial->state == initial) { 2567 partial->pclu = EXT4_B2C(sbi, pblk); 2568 partial->lblk = from; 2569 partial->state = tofree; 2570 } 2571 } else { 2572 partial->state = initial; 2573 } 2574 2575 return 0; 2576} 2577 2578/* 2579 * ext4_ext_rm_leaf() Removes the extents associated with the 2580 * blocks appearing between "start" and "end". Both "start" 2581 * and "end" must appear in the same extent or EIO is returned. 2582 * 2583 * @handle: The journal handle 2584 * @inode: The files inode 2585 * @path: The path to the leaf 2586 * @partial_cluster: The cluster which we'll have to free if all extents 2587 * has been released from it. However, if this value is 2588 * negative, it's a cluster just to the right of the 2589 * punched region and it must not be freed. 
2590 * @start: The first block to remove 2591 * @end: The last block to remove 2592 */ 2593static int 2594ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2595 struct ext4_ext_path *path, 2596 struct partial_cluster *partial, 2597 ext4_lblk_t start, ext4_lblk_t end) 2598{ 2599 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2600 int err = 0, correct_index = 0; 2601 int depth = ext_depth(inode), credits, revoke_credits; 2602 struct ext4_extent_header *eh; 2603 ext4_lblk_t a, b; 2604 unsigned num; 2605 ext4_lblk_t ex_ee_block; 2606 unsigned short ex_ee_len; 2607 unsigned unwritten = 0; 2608 struct ext4_extent *ex; 2609 ext4_fsblk_t pblk; 2610 2611 /* the header must be checked already in ext4_ext_remove_space() */ 2612 ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); 2613 if (!path[depth].p_hdr) 2614 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2615 eh = path[depth].p_hdr; 2616 if (unlikely(path[depth].p_hdr == NULL)) { 2617 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2618 return -EFSCORRUPTED; 2619 } 2620 /* find where to start removing */ 2621 ex = path[depth].p_ext; 2622 if (!ex) 2623 ex = EXT_LAST_EXTENT(eh); 2624 2625 ex_ee_block = le32_to_cpu(ex->ee_block); 2626 ex_ee_len = ext4_ext_get_actual_len(ex); 2627 2628 trace_ext4_ext_rm_leaf(inode, start, ex, partial); 2629 2630 while (ex >= EXT_FIRST_EXTENT(eh) && 2631 ex_ee_block + ex_ee_len > start) { 2632 2633 if (ext4_ext_is_unwritten(ex)) 2634 unwritten = 1; 2635 else 2636 unwritten = 0; 2637 2638 ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, 2639 unwritten, ex_ee_len); 2640 path[depth].p_ext = ex; 2641 2642 a = ex_ee_block > start ? ex_ee_block : start; 2643 b = ex_ee_block+ex_ee_len - 1 < end ? 2644 ex_ee_block+ex_ee_len - 1 : end; 2645 2646 ext_debug(inode, " border %u:%u\n", a, b); 2647 2648 /* If this extent is beyond the end of the hole, skip it */ 2649 if (end < ex_ee_block) { 2650 /* 2651 * We're going to skip this extent and move to another, 2652 * so note that its first cluster is in use to avoid 2653 * freeing it when removing blocks. Eventually, the 2654 * right edge of the truncated/punched region will 2655 * be just to the left. 2656 */ 2657 if (sbi->s_cluster_ratio > 1) { 2658 pblk = ext4_ext_pblock(ex); 2659 partial->pclu = EXT4_B2C(sbi, pblk); 2660 partial->state = nofree; 2661 } 2662 ex--; 2663 ex_ee_block = le32_to_cpu(ex->ee_block); 2664 ex_ee_len = ext4_ext_get_actual_len(ex); 2665 continue; 2666 } else if (b != ex_ee_block + ex_ee_len - 1) { 2667 EXT4_ERROR_INODE(inode, 2668 "can not handle truncate %u:%u " 2669 "on extent %u:%u", 2670 start, end, ex_ee_block, 2671 ex_ee_block + ex_ee_len - 1); 2672 err = -EFSCORRUPTED; 2673 goto out; 2674 } else if (a != ex_ee_block) { 2675 /* remove tail of the extent */ 2676 num = a - ex_ee_block; 2677 } else { 2678 /* remove whole extent: excellent! */ 2679 num = 0; 2680 } 2681 /* 2682 * 3 for leaf, sb, and inode plus 2 (bmap and group 2683 * descriptor) for each block group; assume two block 2684 * groups plus ex_ee_len/blocks_per_block_group for 2685 * the worst case 2686 */ 2687 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2688 if (ex == EXT_FIRST_EXTENT(eh)) { 2689 correct_index = 1; 2690 credits += (ext_depth(inode)) + 1; 2691 } 2692 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2693 /* 2694 * We may end up freeing some index blocks and data from the 2695 * punched range. Note that partial clusters are accounted for 2696 * by ext4_free_data_revoke_credits(). 
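 *
 * Rough worked example (illustrative numbers only): removing a 1000 block
 * extent with 32768 blocks per group and a depth-1 tree needs
 * credits = 7 + 2 * (1000 / 32768) = 7, plus depth + 1 = 2 more if this is
 * the first extent in the leaf and the indexes must be corrected, plus the
 * quota transaction blocks added just above.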
2697 */ 2698 revoke_credits = 2699 ext4_free_metadata_revoke_credits(inode->i_sb, 2700 ext_depth(inode)) + 2701 ext4_free_data_revoke_credits(inode, b - a + 1); 2702 2703 err = ext4_datasem_ensure_credits(handle, inode, credits, 2704 credits, revoke_credits); 2705 if (err) { 2706 if (err > 0) 2707 err = -EAGAIN; 2708 goto out; 2709 } 2710 2711 err = ext4_ext_get_access(handle, inode, path + depth); 2712 if (err) 2713 goto out; 2714 2715 err = ext4_remove_blocks(handle, inode, ex, partial, a, b); 2716 if (err) 2717 goto out; 2718 2719 if (num == 0) 2720 /* this extent is removed; mark slot entirely unused */ 2721 ext4_ext_store_pblock(ex, 0); 2722 2723 ex->ee_len = cpu_to_le16(num); 2724 /* 2725 * Do not mark unwritten if all the blocks in the 2726 * extent have been removed. 2727 */ 2728 if (unwritten && num) 2729 ext4_ext_mark_unwritten(ex); 2730 /* 2731 * If the extent was completely released, 2732 * we need to remove it from the leaf 2733 */ 2734 if (num == 0) { 2735 if (end != EXT_MAX_BLOCKS - 1) { 2736 /* 2737 * For hole punching, we need to scoot all the 2738 * extents up when an extent is removed so that 2739 * we dont have blank extents in the middle 2740 */ 2741 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2742 sizeof(struct ext4_extent)); 2743 2744 /* Now get rid of the one at the end */ 2745 memset(EXT_LAST_EXTENT(eh), 0, 2746 sizeof(struct ext4_extent)); 2747 } 2748 le16_add_cpu(&eh->eh_entries, -1); 2749 } 2750 2751 err = ext4_ext_dirty(handle, inode, path + depth); 2752 if (err) 2753 goto out; 2754 2755 ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, 2756 ext4_ext_pblock(ex)); 2757 ex--; 2758 ex_ee_block = le32_to_cpu(ex->ee_block); 2759 ex_ee_len = ext4_ext_get_actual_len(ex); 2760 } 2761 2762 if (correct_index && eh->eh_entries) 2763 err = ext4_ext_correct_indexes(handle, inode, path); 2764 2765 /* 2766 * If there's a partial cluster and at least one extent remains in 2767 * the leaf, free the partial cluster if it isn't shared with the 2768 * current extent. If it is shared with the current extent 2769 * we reset the partial cluster because we've reached the start of the 2770 * truncated/punched region and we're done removing blocks. 
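 *
 * Example for illustration (not in the original comment): with a bigalloc
 * cluster ratio of 16, a cluster recorded as 'tofree' while trimming
 * earlier extents is freed here only when the last block of the extent we
 * stopped at lies in a different cluster; if they share a cluster, the
 * cluster is still in use and only the partial-cluster state is reset.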
2771 */ 2772 if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { 2773 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; 2774 if (partial->pclu != EXT4_B2C(sbi, pblk)) { 2775 int flags = get_default_free_blocks_flags(inode); 2776 2777 if (ext4_is_pending(inode, partial->lblk)) 2778 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; 2779 ext4_free_blocks(handle, inode, NULL, 2780 EXT4_C2B(sbi, partial->pclu), 2781 sbi->s_cluster_ratio, flags); 2782 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) 2783 ext4_rereserve_cluster(inode, partial->lblk); 2784 } 2785 partial->state = initial; 2786 } 2787 2788 /* if this leaf is free, then we should 2789 * remove it from index block above */ 2790 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2791 err = ext4_ext_rm_idx(handle, inode, path, depth); 2792 2793out: 2794 return err; 2795} 2796 2797/* 2798 * ext4_ext_more_to_rm: 2799 * returns 1 if current index has to be freed (even partial) 2800 */ 2801static int 2802ext4_ext_more_to_rm(struct ext4_ext_path *path) 2803{ 2804 BUG_ON(path->p_idx == NULL); 2805 2806 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2807 return 0; 2808 2809 /* 2810 * if truncate on deeper level happened, it wasn't partial, 2811 * so we have to consider current index for truncation 2812 */ 2813 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2814 return 0; 2815 return 1; 2816} 2817 2818int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2819 ext4_lblk_t end) 2820{ 2821 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2822 int depth = ext_depth(inode); 2823 struct ext4_ext_path *path = NULL; 2824 struct partial_cluster partial; 2825 handle_t *handle; 2826 int i = 0, err = 0; 2827 2828 partial.pclu = 0; 2829 partial.lblk = 0; 2830 partial.state = initial; 2831 2832 ext_debug(inode, "truncate since %u to %u\n", start, end); 2833 2834 /* probably first extent we're gonna free will be last in block */ 2835 handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, 2836 depth + 1, 2837 ext4_free_metadata_revoke_credits(inode->i_sb, depth)); 2838 if (IS_ERR(handle)) 2839 return PTR_ERR(handle); 2840 2841again: 2842 trace_ext4_ext_remove_space(inode, start, end, depth); 2843 2844 /* 2845 * Check if we are removing extents inside the extent tree. If that 2846 * is the case, we are going to punch a hole inside the extent tree 2847 * so we have to check whether we need to split the extent covering 2848 * the last block to remove so we can easily remove the part of it 2849 * in ext4_ext_rm_leaf(). 2850 */ 2851 if (end < EXT_MAX_BLOCKS - 1) { 2852 struct ext4_extent *ex; 2853 ext4_lblk_t ee_block, ex_end, lblk; 2854 ext4_fsblk_t pblk; 2855 2856 /* find extent for or closest extent to this block */ 2857 path = ext4_find_extent(inode, end, NULL, 2858 EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); 2859 if (IS_ERR(path)) { 2860 ext4_journal_stop(handle); 2861 return PTR_ERR(path); 2862 } 2863 depth = ext_depth(inode); 2864 /* Leaf not may not exist only if inode has no blocks at all */ 2865 ex = path[depth].p_ext; 2866 if (!ex) { 2867 if (depth) { 2868 EXT4_ERROR_INODE(inode, 2869 "path[%d].p_hdr == NULL", 2870 depth); 2871 err = -EFSCORRUPTED; 2872 } 2873 goto out; 2874 } 2875 2876 ee_block = le32_to_cpu(ex->ee_block); 2877 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; 2878 2879 /* 2880 * See if the last block is inside the extent, if so split 2881 * the extent at 'end' block so we can easily remove the 2882 * tail of the first part of the split extent in 2883 * ext4_ext_rm_leaf(). 
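 *
 * For example (illustrative numbers): punching blocks 10-19 out of an
 * extent covering logical blocks 0-99 first splits it at block 20, so that
 * ext4_ext_rm_leaf() only ever has to trim the tail of an extent and never
 * remove a range from its middle.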
2884 */ 2885 if (end >= ee_block && end < ex_end) { 2886 2887 /* 2888 * If we're going to split the extent, note that 2889 * the cluster containing the block after 'end' is 2890 * in use to avoid freeing it when removing blocks. 2891 */ 2892 if (sbi->s_cluster_ratio > 1) { 2893 pblk = ext4_ext_pblock(ex) + end - ee_block + 1; 2894 partial.pclu = EXT4_B2C(sbi, pblk); 2895 partial.state = nofree; 2896 } 2897 2898 /* 2899 * Split the extent in two so that 'end' is the last 2900 * block in the first new extent. Also we should not 2901 * fail removing space due to ENOSPC so try to use 2902 * reserved block if that happens. 2903 */ 2904 err = ext4_force_split_extent_at(handle, inode, &path, 2905 end + 1, 1); 2906 if (err < 0) 2907 goto out; 2908 2909 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && 2910 partial.state == initial) { 2911 /* 2912 * If we're punching, there's an extent to the right. 2913 * If the partial cluster hasn't been set, set it to 2914 * that extent's first cluster and its state to nofree 2915 * so it won't be freed should it contain blocks to be 2916 * removed. If it's already set (tofree/nofree), we're 2917 * retrying and keep the original partial cluster info 2918 * so a cluster marked tofree as a result of earlier 2919 * extent removal is not lost. 2920 */ 2921 lblk = ex_end + 1; 2922 err = ext4_ext_search_right(inode, path, &lblk, &pblk, 2923 NULL); 2924 if (err < 0) 2925 goto out; 2926 if (pblk) { 2927 partial.pclu = EXT4_B2C(sbi, pblk); 2928 partial.state = nofree; 2929 } 2930 } 2931 } 2932 /* 2933 * We start scanning from right side, freeing all the blocks 2934 * after i_size and walking into the tree depth-wise. 2935 */ 2936 depth = ext_depth(inode); 2937 if (path) { 2938 int k = i = depth; 2939 while (--k > 0) 2940 path[k].p_block = 2941 le16_to_cpu(path[k].p_hdr->eh_entries)+1; 2942 } else { 2943 path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), 2944 GFP_NOFS | __GFP_NOFAIL); 2945 if (path == NULL) { 2946 ext4_journal_stop(handle); 2947 return -ENOMEM; 2948 } 2949 path[0].p_maxdepth = path[0].p_depth = depth; 2950 path[0].p_hdr = ext_inode_hdr(inode); 2951 i = 0; 2952 2953 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { 2954 err = -EFSCORRUPTED; 2955 goto out; 2956 } 2957 } 2958 err = 0; 2959 2960 while (i >= 0 && err == 0) { 2961 if (i == depth) { 2962 /* this is leaf block */ 2963 err = ext4_ext_rm_leaf(handle, inode, path, 2964 &partial, start, end); 2965 /* root level has p_bh == NULL, brelse() eats this */ 2966 brelse(path[i].p_bh); 2967 path[i].p_bh = NULL; 2968 i--; 2969 continue; 2970 } 2971 2972 /* this is index block */ 2973 if (!path[i].p_hdr) { 2974 ext_debug(inode, "initialize header\n"); 2975 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2976 } 2977 2978 if (!path[i].p_idx) { 2979 /* this level hasn't been touched yet */ 2980 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2981 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2982 ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", 2983 path[i].p_hdr, 2984 le16_to_cpu(path[i].p_hdr->eh_entries)); 2985 } else { 2986 /* we were already here, see at next index */ 2987 path[i].p_idx--; 2988 } 2989 2990 ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", 2991 i, EXT_FIRST_INDEX(path[i].p_hdr), 2992 path[i].p_idx); 2993 if (ext4_ext_more_to_rm(path + i)) { 2994 struct buffer_head *bh; 2995 /* go to the next level */ 2996 ext_debug(inode, "move to level %d (block %llu)\n", 2997 i + 1, ext4_idx_pblock(path[i].p_idx)); 2998 memset(path + i + 1, 0, sizeof(*path)); 2999 
bh = read_extent_tree_block(inode, path[i].p_idx, 3000 depth - i - 1, 3001 EXT4_EX_NOCACHE); 3002 if (IS_ERR(bh)) { 3003 /* should we reset i_size? */ 3004 err = PTR_ERR(bh); 3005 break; 3006 } 3007 /* Yield here to deal with large extent trees. 3008 * Should be a no-op if we did IO above. */ 3009 cond_resched(); 3010 if (WARN_ON(i + 1 > depth)) { 3011 err = -EFSCORRUPTED; 3012 break; 3013 } 3014 path[i + 1].p_bh = bh; 3015 3016 /* save actual number of indexes since this 3017 * number is changed at the next iteration */ 3018 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 3019 i++; 3020 } else { 3021 /* we finished processing this index, go up */ 3022 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 3023 /* index is empty, remove it; 3024 * handle must be already prepared by the 3025 * truncatei_leaf() */ 3026 err = ext4_ext_rm_idx(handle, inode, path, i); 3027 } 3028 /* root level has p_bh == NULL, brelse() eats this */ 3029 brelse(path[i].p_bh); 3030 path[i].p_bh = NULL; 3031 i--; 3032 ext_debug(inode, "return to level %d\n", i); 3033 } 3034 } 3035 3036 trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, 3037 path->p_hdr->eh_entries); 3038 3039 /* 3040 * if there's a partial cluster and we have removed the first extent 3041 * in the file, then we also free the partial cluster, if any 3042 */ 3043 if (partial.state == tofree && err == 0) { 3044 int flags = get_default_free_blocks_flags(inode); 3045 3046 if (ext4_is_pending(inode, partial.lblk)) 3047 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; 3048 ext4_free_blocks(handle, inode, NULL, 3049 EXT4_C2B(sbi, partial.pclu), 3050 sbi->s_cluster_ratio, flags); 3051 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) 3052 ext4_rereserve_cluster(inode, partial.lblk); 3053 partial.state = initial; 3054 } 3055 3056 /* TODO: flexible tree reduction should be here */ 3057 if (path->p_hdr->eh_entries == 0) { 3058 /* 3059 * truncate to zero freed all the tree, 3060 * so we need to correct eh_depth 3061 */ 3062 err = ext4_ext_get_access(handle, inode, path); 3063 if (err == 0) { 3064 ext_inode_hdr(inode)->eh_depth = 0; 3065 ext_inode_hdr(inode)->eh_max = 3066 cpu_to_le16(ext4_ext_space_root(inode, 0)); 3067 err = ext4_ext_dirty(handle, inode, path); 3068 } 3069 } 3070out: 3071 ext4_ext_drop_refs(path); 3072 kfree(path); 3073 path = NULL; 3074 if (err == -EAGAIN) 3075 goto again; 3076 ext4_journal_stop(handle); 3077 3078 return err; 3079} 3080 3081/* 3082 * called at mount time 3083 */ 3084void ext4_ext_init(struct super_block *sb) 3085{ 3086 /* 3087 * possible initialization would be here 3088 */ 3089 3090 if (ext4_has_feature_extents(sb)) { 3091#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 3092 printk(KERN_INFO "EXT4-fs: file extents enabled" 3093#ifdef AGGRESSIVE_TEST 3094 ", aggressive tests" 3095#endif 3096#ifdef CHECK_BINSEARCH 3097 ", check binsearch" 3098#endif 3099#ifdef EXTENTS_STATS 3100 ", stats" 3101#endif 3102 "\n"); 3103#endif 3104#ifdef EXTENTS_STATS 3105 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 3106 EXT4_SB(sb)->s_ext_min = 1 << 30; 3107 EXT4_SB(sb)->s_ext_max = 0; 3108#endif 3109 } 3110} 3111 3112/* 3113 * called at umount time 3114 */ 3115void ext4_ext_release(struct super_block *sb) 3116{ 3117 if (!ext4_has_feature_extents(sb)) 3118 return; 3119 3120#ifdef EXTENTS_STATS 3121 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 3122 struct ext4_sb_info *sbi = EXT4_SB(sb); 3123 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 3124 
sbi->s_ext_blocks, sbi->s_ext_extents, 3125 sbi->s_ext_blocks / sbi->s_ext_extents); 3126 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 3127 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 3128 } 3129#endif 3130} 3131 3132static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) 3133{ 3134 ext4_lblk_t ee_block; 3135 ext4_fsblk_t ee_pblock; 3136 unsigned int ee_len; 3137 3138 ee_block = le32_to_cpu(ex->ee_block); 3139 ee_len = ext4_ext_get_actual_len(ex); 3140 ee_pblock = ext4_ext_pblock(ex); 3141 3142 if (ee_len == 0) 3143 return 0; 3144 3145 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, 3146 EXTENT_STATUS_WRITTEN); 3147} 3148 3149/* FIXME!! we need to try to merge to left or right after zero-out */ 3150static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 3151{ 3152 ext4_fsblk_t ee_pblock; 3153 unsigned int ee_len; 3154 3155 ee_len = ext4_ext_get_actual_len(ex); 3156 ee_pblock = ext4_ext_pblock(ex); 3157 return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, 3158 ee_len); 3159} 3160 3161/* 3162 * ext4_split_extent_at() splits an extent at the given block. 3163 * 3164 * @handle: the journal handle 3165 * @inode: the file inode 3166 * @path: the path to the extent 3167 * @split: the logical block where the extent is split. 3168 * @split_flags: indicates if the extent could be zeroed out if the split fails, 3169 * and the states (initialized or unwritten) of the new extents. 3170 * @flags: flags used to insert the new extent into the extent tree. 3171 * 3172 * 3173 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the states 3174 * of which are determined by split_flag. 3175 * 3176 * There are two cases: 3177 * a> the extent is split into two extents. 3178 * b> a split is not needed, and the extent is just marked. 3179 * 3180 * return 0 on success. 3181 */ 3182static int ext4_split_extent_at(handle_t *handle, 3183 struct inode *inode, 3184 struct ext4_ext_path **ppath, 3185 ext4_lblk_t split, 3186 int split_flag, 3187 int flags) 3188{ 3189 struct ext4_ext_path *path = *ppath; 3190 ext4_fsblk_t newblock; 3191 ext4_lblk_t ee_block; 3192 struct ext4_extent *ex, newex, orig_ex, zero_ex; 3193 struct ext4_extent *ex2 = NULL; 3194 unsigned int ee_len, depth; 3195 int err = 0; 3196 3197 BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == 3198 (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); 3199 3200 ext_debug(inode, "logical block %llu\n", (unsigned long long)split); 3201 3202 ext4_ext_show_leaf(inode, path); 3203 3204 depth = ext_depth(inode); 3205 ex = path[depth].p_ext; 3206 ee_block = le32_to_cpu(ex->ee_block); 3207 ee_len = ext4_ext_get_actual_len(ex); 3208 newblock = split - ee_block + ext4_ext_pblock(ex); 3209 3210 BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 3211 BUG_ON(!ext4_ext_is_unwritten(ex) && 3212 split_flag & (EXT4_EXT_MAY_ZEROOUT | 3213 EXT4_EXT_MARK_UNWRIT1 | 3214 EXT4_EXT_MARK_UNWRIT2)); 3215 3216 err = ext4_ext_get_access(handle, inode, path + depth); 3217 if (err) 3218 goto out; 3219 3220 if (split == ee_block) { 3221 /* 3222 * case b: block @split is the block that the extent begins with, 3223 * so we just change the state of the extent, and splitting 3224 * is not needed.
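 *
 * For instance (illustrative): if the extent starts at logical block 100
 * and @split == 100, no second extent is inserted below; the existing
 * extent is simply marked unwritten or initialized as requested by the
 * split flags.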
3225 */ 3226 if (split_flag & EXT4_EXT_MARK_UNWRIT2) 3227 ext4_ext_mark_unwritten(ex); 3228 else 3229 ext4_ext_mark_initialized(ex); 3230 3231 if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 3232 ext4_ext_try_to_merge(handle, inode, path, ex); 3233 3234 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3235 goto out; 3236 } 3237 3238 /* case a */ 3239 memcpy(&orig_ex, ex, sizeof(orig_ex)); 3240 ex->ee_len = cpu_to_le16(split - ee_block); 3241 if (split_flag & EXT4_EXT_MARK_UNWRIT1) 3242 ext4_ext_mark_unwritten(ex); 3243 3244 /* 3245 * path may lead to new leaf, not to original leaf any more 3246 * after ext4_ext_insert_extent() returns, 3247 */ 3248 err = ext4_ext_dirty(handle, inode, path + depth); 3249 if (err) 3250 goto fix_extent_len; 3251 3252 ex2 = &newex; 3253 ex2->ee_block = cpu_to_le32(split); 3254 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); 3255 ext4_ext_store_pblock(ex2, newblock); 3256 if (split_flag & EXT4_EXT_MARK_UNWRIT2) 3257 ext4_ext_mark_unwritten(ex2); 3258 3259 err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); 3260 if (err != -ENOSPC && err != -EDQUOT) 3261 goto out; 3262 3263 if (EXT4_EXT_MAY_ZEROOUT & split_flag) { 3264 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 3265 if (split_flag & EXT4_EXT_DATA_VALID1) { 3266 err = ext4_ext_zeroout(inode, ex2); 3267 zero_ex.ee_block = ex2->ee_block; 3268 zero_ex.ee_len = cpu_to_le16( 3269 ext4_ext_get_actual_len(ex2)); 3270 ext4_ext_store_pblock(&zero_ex, 3271 ext4_ext_pblock(ex2)); 3272 } else { 3273 err = ext4_ext_zeroout(inode, ex); 3274 zero_ex.ee_block = ex->ee_block; 3275 zero_ex.ee_len = cpu_to_le16( 3276 ext4_ext_get_actual_len(ex)); 3277 ext4_ext_store_pblock(&zero_ex, 3278 ext4_ext_pblock(ex)); 3279 } 3280 } else { 3281 err = ext4_ext_zeroout(inode, &orig_ex); 3282 zero_ex.ee_block = orig_ex.ee_block; 3283 zero_ex.ee_len = cpu_to_le16( 3284 ext4_ext_get_actual_len(&orig_ex)); 3285 ext4_ext_store_pblock(&zero_ex, 3286 ext4_ext_pblock(&orig_ex)); 3287 } 3288 3289 if (!err) { 3290 /* update the extent length and mark as initialized */ 3291 ex->ee_len = cpu_to_le16(ee_len); 3292 ext4_ext_try_to_merge(handle, inode, path, ex); 3293 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3294 if (!err) 3295 /* update extent status tree */ 3296 err = ext4_zeroout_es(inode, &zero_ex); 3297 /* If we failed at this point, we don't know in which 3298 * state the extent tree exactly is so don't try to fix 3299 * length of the original extent as it may do even more 3300 * damage. 3301 */ 3302 goto out; 3303 } 3304 } 3305 3306fix_extent_len: 3307 ex->ee_len = orig_ex.ee_len; 3308 /* 3309 * Ignore ext4_ext_dirty return value since we are already in error path 3310 * and err is a non-zero error code. 
3311 */ 3312 ext4_ext_dirty(handle, inode, path + path->p_depth); 3313 return err; 3314out: 3315 ext4_ext_show_leaf(inode, path); 3316 return err; 3317} 3318 3319/* 3320 * ext4_split_extent() splits an extent and marks the extent covered 3321 * by @map as @split_flag indicates. 3322 * 3323 * It may result in splitting the extent into multiple extents (up to three). 3324 * There are three possibilities: 3325 * a> There is no split required 3326 * b> Splits in two extents: Split is happening at either end of the extent 3327 * c> Splits in three extents: Someone is splitting in the middle of the extent 3328 * 3329 */ 3330static int ext4_split_extent(handle_t *handle, 3331 struct inode *inode, 3332 struct ext4_ext_path **ppath, 3333 struct ext4_map_blocks *map, 3334 int split_flag, 3335 int flags) 3336{ 3337 struct ext4_ext_path *path = *ppath; 3338 ext4_lblk_t ee_block; 3339 struct ext4_extent *ex; 3340 unsigned int ee_len, depth; 3341 int err = 0; 3342 int unwritten; 3343 int split_flag1, flags1; 3344 int allocated = map->m_len; 3345 3346 depth = ext_depth(inode); 3347 ex = path[depth].p_ext; 3348 ee_block = le32_to_cpu(ex->ee_block); 3349 ee_len = ext4_ext_get_actual_len(ex); 3350 unwritten = ext4_ext_is_unwritten(ex); 3351 3352 if (map->m_lblk + map->m_len < ee_block + ee_len) { 3353 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; 3354 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 3355 if (unwritten) 3356 split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | 3357 EXT4_EXT_MARK_UNWRIT2; 3358 if (split_flag & EXT4_EXT_DATA_VALID2) 3359 split_flag1 |= EXT4_EXT_DATA_VALID1; 3360 err = ext4_split_extent_at(handle, inode, ppath, 3361 map->m_lblk + map->m_len, split_flag1, flags1); 3362 if (err) 3363 goto out; 3364 } else { 3365 allocated = ee_len - (map->m_lblk - ee_block); 3366 } 3367 /* 3368 * Updating the path is required because the previous ext4_split_extent_at() 3369 * may result in a split of the original leaf or an extent zeroout. 3370 */ 3371 path = ext4_find_extent(inode, map->m_lblk, ppath, flags); 3372 if (IS_ERR(path)) 3373 return PTR_ERR(path); 3374 depth = ext_depth(inode); 3375 ex = path[depth].p_ext; 3376 if (!ex) { 3377 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 3378 (unsigned long) map->m_lblk); 3379 return -EFSCORRUPTED; 3380 } 3381 unwritten = ext4_ext_is_unwritten(ex); 3382 split_flag1 = 0; 3383 3384 if (map->m_lblk >= ee_block) { 3385 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; 3386 if (unwritten) { 3387 split_flag1 |= EXT4_EXT_MARK_UNWRIT1; 3388 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | 3389 EXT4_EXT_MARK_UNWRIT2); 3390 } 3391 err = ext4_split_extent_at(handle, inode, ppath, 3392 map->m_lblk, split_flag1, flags); 3393 if (err) 3394 goto out; 3395 } 3396 3397 ext4_ext_show_leaf(inode, path); 3398out: 3399 return err ? err : allocated; 3400} 3401 3402/* 3403 * This function is called by ext4_ext_map_blocks() if someone tries to write 3404 * to an unwritten extent. It may result in splitting the unwritten 3405 * extent into multiple extents (up to three - one initialized and two 3406 * unwritten). 3407 * There are three possibilities: 3408 * a> There is no split required: Entire extent should be initialized 3409 * b> Splits in two extents: Write is happening at either end of the extent 3410 * c> Splits in three extents: Someone is writing in the middle of the extent 3411 * 3412 * Pre-conditions: 3413 * - The extent pointed to by 'path' is unwritten. 3414 * - The extent pointed to by 'path' contains a superset 3415 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3416 * 3417 * Post-conditions on success: 3418 * - the returned value is the number of blocks beyond map->l_lblk 3419 * that are allocated and initialized. 3420 * It is guaranteed to be >= map->m_len. 3421 */ 3422static int ext4_ext_convert_to_initialized(handle_t *handle, 3423 struct inode *inode, 3424 struct ext4_map_blocks *map, 3425 struct ext4_ext_path **ppath, 3426 int flags) 3427{ 3428 struct ext4_ext_path *path = *ppath; 3429 struct ext4_sb_info *sbi; 3430 struct ext4_extent_header *eh; 3431 struct ext4_map_blocks split_map; 3432 struct ext4_extent zero_ex1, zero_ex2; 3433 struct ext4_extent *ex, *abut_ex; 3434 ext4_lblk_t ee_block, eof_block; 3435 unsigned int ee_len, depth, map_len = map->m_len; 3436 int allocated = 0, max_zeroout = 0; 3437 int err = 0; 3438 int split_flag = EXT4_EXT_DATA_VALID2; 3439 3440 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3441 (unsigned long long)map->m_lblk, map_len); 3442 3443 sbi = EXT4_SB(inode->i_sb); 3444 eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) 3445 >> inode->i_sb->s_blocksize_bits; 3446 if (eof_block < map->m_lblk + map_len) 3447 eof_block = map->m_lblk + map_len; 3448 3449 depth = ext_depth(inode); 3450 eh = path[depth].p_hdr; 3451 ex = path[depth].p_ext; 3452 ee_block = le32_to_cpu(ex->ee_block); 3453 ee_len = ext4_ext_get_actual_len(ex); 3454 zero_ex1.ee_len = 0; 3455 zero_ex2.ee_len = 0; 3456 3457 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3458 3459 /* Pre-conditions */ 3460 BUG_ON(!ext4_ext_is_unwritten(ex)); 3461 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 3462 3463 /* 3464 * Attempt to transfer newly initialized blocks from the currently 3465 * unwritten extent to its neighbor. This is much cheaper 3466 * than an insertion followed by a merge as those involve costly 3467 * memmove() calls. Transferring to the left is the common case in 3468 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) 3469 * followed by append writes. 3470 * 3471 * Limitations of the current logic: 3472 * - L1: we do not deal with writes covering the whole extent. 3473 * This would require removing the extent if the transfer 3474 * is possible. 3475 * - L2: we only attempt to merge with an extent stored in the 3476 * same extent tree node. 3477 */ 3478 if ((map->m_lblk == ee_block) && 3479 /* See if we can merge left */ 3480 (map_len < ee_len) && /*L1*/ 3481 (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ 3482 ext4_lblk_t prev_lblk; 3483 ext4_fsblk_t prev_pblk, ee_pblk; 3484 unsigned int prev_len; 3485 3486 abut_ex = ex - 1; 3487 prev_lblk = le32_to_cpu(abut_ex->ee_block); 3488 prev_len = ext4_ext_get_actual_len(abut_ex); 3489 prev_pblk = ext4_ext_pblock(abut_ex); 3490 ee_pblk = ext4_ext_pblock(ex); 3491 3492 /* 3493 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3494 * upon those conditions: 3495 * - C1: abut_ex is initialized, 3496 * - C2: abut_ex is logically abutting ex, 3497 * - C3: abut_ex is physically abutting ex, 3498 * - C4: abut_ex can receive the additional blocks without 3499 * overflowing the (initialized) length limit. 
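 *
 * Worked example (made-up numbers, not from the original comment): a write
 * of 8 blocks at the start of an unwritten extent ex = [lblk 200, len 64]
 * whose left neighbour abut_ex = [lblk 192, len 8] is initialized and
 * physically contiguous satisfies C1-C4, so abut_ex grows to len 16 while
 * ex shrinks to [lblk 208, len 56] and stays unwritten; no new extent has
 * to be inserted or merged afterwards.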
3500 */ 3501 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3502 ((prev_lblk + prev_len) == ee_block) && /*C2*/ 3503 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ 3504 (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3505 err = ext4_ext_get_access(handle, inode, path + depth); 3506 if (err) 3507 goto out; 3508 3509 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3510 map, ex, abut_ex); 3511 3512 /* Shift the start of ex by 'map_len' blocks */ 3513 ex->ee_block = cpu_to_le32(ee_block + map_len); 3514 ext4_ext_store_pblock(ex, ee_pblk + map_len); 3515 ex->ee_len = cpu_to_le16(ee_len - map_len); 3516 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3517 3518 /* Extend abut_ex by 'map_len' blocks */ 3519 abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 3520 3521 /* Result: number of initialized blocks past m_lblk */ 3522 allocated = map_len; 3523 } 3524 } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && 3525 (map_len < ee_len) && /*L1*/ 3526 ex < EXT_LAST_EXTENT(eh)) { /*L2*/ 3527 /* See if we can merge right */ 3528 ext4_lblk_t next_lblk; 3529 ext4_fsblk_t next_pblk, ee_pblk; 3530 unsigned int next_len; 3531 3532 abut_ex = ex + 1; 3533 next_lblk = le32_to_cpu(abut_ex->ee_block); 3534 next_len = ext4_ext_get_actual_len(abut_ex); 3535 next_pblk = ext4_ext_pblock(abut_ex); 3536 ee_pblk = ext4_ext_pblock(ex); 3537 3538 /* 3539 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3540 * upon those conditions: 3541 * - C1: abut_ex is initialized, 3542 * - C2: abut_ex is logically abutting ex, 3543 * - C3: abut_ex is physically abutting ex, 3544 * - C4: abut_ex can receive the additional blocks without 3545 * overflowing the (initialized) length limit. 3546 */ 3547 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3548 ((map->m_lblk + map_len) == next_lblk) && /*C2*/ 3549 ((ee_pblk + ee_len) == next_pblk) && /*C3*/ 3550 (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3551 err = ext4_ext_get_access(handle, inode, path + depth); 3552 if (err) 3553 goto out; 3554 3555 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3556 map, ex, abut_ex); 3557 3558 /* Shift the start of abut_ex by 'map_len' blocks */ 3559 abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 3560 ext4_ext_store_pblock(abut_ex, next_pblk - map_len); 3561 ex->ee_len = cpu_to_le16(ee_len - map_len); 3562 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3563 3564 /* Extend abut_ex by 'map_len' blocks */ 3565 abut_ex->ee_len = cpu_to_le16(next_len + map_len); 3566 3567 /* Result: number of initialized blocks past m_lblk */ 3568 allocated = map_len; 3569 } 3570 } 3571 if (allocated) { 3572 /* Mark the block containing both extents as dirty */ 3573 err = ext4_ext_dirty(handle, inode, path + depth); 3574 3575 /* Update path to point to the right extent */ 3576 path[depth].p_ext = abut_ex; 3577 goto out; 3578 } else 3579 allocated = ee_len - (map->m_lblk - ee_block); 3580 3581 WARN_ON(map->m_lblk < ee_block); 3582 /* 3583 * It is safe to convert extent to initialized via explicit 3584 * zeroout only if extent is fully inside i_size or new_size. 3585 */ 3586 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3587 3588 if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3589 max_zeroout = sbi->s_extent_max_zeroout_kb >> 3590 (inode->i_sb->s_blocksize_bits - 10); 3591 3592 /* 3593 * five cases: 3594 * 1. split the extent into three extents. 3595 * 2. split the extent into two extents, zeroout the head of the first 3596 * extent. 3597 * 3. 
split the extent into two extents, zeroout the tail of the second 3598 * extent. 3599 * 4. split the extent into two extents without zeroout. 3600 * 5. no splitting needed, just possibly zeroout the head and/or the 3601 * tail of the extent. 3602 */ 3603 split_map.m_lblk = map->m_lblk; 3604 split_map.m_len = map->m_len; 3605 3606 if (max_zeroout && (allocated > split_map.m_len)) { 3607 if (allocated <= max_zeroout) { 3608 /* case 3 or 5 */ 3609 zero_ex1.ee_block = 3610 cpu_to_le32(split_map.m_lblk + 3611 split_map.m_len); 3612 zero_ex1.ee_len = 3613 cpu_to_le16(allocated - split_map.m_len); 3614 ext4_ext_store_pblock(&zero_ex1, 3615 ext4_ext_pblock(ex) + split_map.m_lblk + 3616 split_map.m_len - ee_block); 3617 err = ext4_ext_zeroout(inode, &zero_ex1); 3618 if (err) 3619 goto fallback; 3620 split_map.m_len = allocated; 3621 } 3622 if (split_map.m_lblk - ee_block + split_map.m_len < 3623 max_zeroout) { 3624 /* case 2 or 5 */ 3625 if (split_map.m_lblk != ee_block) { 3626 zero_ex2.ee_block = ex->ee_block; 3627 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - 3628 ee_block); 3629 ext4_ext_store_pblock(&zero_ex2, 3630 ext4_ext_pblock(ex)); 3631 err = ext4_ext_zeroout(inode, &zero_ex2); 3632 if (err) 3633 goto fallback; 3634 } 3635 3636 split_map.m_len += split_map.m_lblk - ee_block; 3637 split_map.m_lblk = ee_block; 3638 allocated = map->m_len; 3639 } 3640 } 3641 3642fallback: 3643 err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, 3644 flags); 3645 if (err > 0) 3646 err = 0; 3647out: 3648 /* If we have gotten a failure, don't zero out the status tree */ 3649 if (!err) { 3650 err = ext4_zeroout_es(inode, &zero_ex1); 3651 if (!err) 3652 err = ext4_zeroout_es(inode, &zero_ex2); 3653 } 3654 return err ? err : allocated; 3655} 3656 3657/* 3658 * This function is called by ext4_ext_map_blocks() from 3659 * ext4_get_blocks_dio_write() when DIO is used to write 3660 * to an unwritten extent. 3661 * 3662 * Writing to an unwritten extent may result in splitting the unwritten 3663 * extent into multiple initialized/unwritten extents (up to three). 3664 * There are three possibilities: 3665 * a> There is no split required: Entire extent should be unwritten 3666 * b> Splits in two extents: Write is happening at either end of the extent 3667 * c> Splits in three extents: Someone is writing in the middle of the extent 3668 * 3669 * This works the same way in the case of initialized -> unwritten conversion. 3670 * 3671 * One or more index blocks may be needed if the extent tree grows after 3672 * the unwritten extent split. To prevent ENOSPC from occurring at IO 3673 * completion, we need to split the unwritten extent before submitting 3674 * the DIO. The unwritten extent will be split 3675 * into (at most) three unwritten extents. After the IO completes, the part 3676 * that was written is converted to initialized by the end_io callback 3677 * via ext4_convert_unwritten_extents(). 3678 * 3679 * Returns the size of the unwritten extent to be written on success.
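 *
 * Illustrative example (not part of the original comment): a DIO write
 * covering blocks 50-59 of an unwritten extent [0, 100) is split here into
 * [0, 50), [50, 60) and [60, 100), all still unwritten; only the middle
 * piece is later converted to initialized by the end_io path.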
3680 */ 3681static int ext4_split_convert_extents(handle_t *handle, 3682 struct inode *inode, 3683 struct ext4_map_blocks *map, 3684 struct ext4_ext_path **ppath, 3685 int flags) 3686{ 3687 struct ext4_ext_path *path = *ppath; 3688 ext4_lblk_t eof_block; 3689 ext4_lblk_t ee_block; 3690 struct ext4_extent *ex; 3691 unsigned int ee_len; 3692 int split_flag = 0, depth; 3693 3694 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3695 (unsigned long long)map->m_lblk, map->m_len); 3696 3697 eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) 3698 >> inode->i_sb->s_blocksize_bits; 3699 if (eof_block < map->m_lblk + map->m_len) 3700 eof_block = map->m_lblk + map->m_len; 3701 /* 3702 * It is safe to convert extent to initialized via explicit 3703 * zeroout only if extent is fully inside i_size or new_size. 3704 */ 3705 depth = ext_depth(inode); 3706 ex = path[depth].p_ext; 3707 ee_block = le32_to_cpu(ex->ee_block); 3708 ee_len = ext4_ext_get_actual_len(ex); 3709 3710 /* Convert to unwritten */ 3711 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { 3712 split_flag |= EXT4_EXT_DATA_VALID1; 3713 /* Convert to initialized */ 3714 } else if (flags & EXT4_GET_BLOCKS_CONVERT) { 3715 split_flag |= ee_block + ee_len <= eof_block ? 3716 EXT4_EXT_MAY_ZEROOUT : 0; 3717 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); 3718 } 3719 flags |= EXT4_GET_BLOCKS_PRE_IO; 3720 return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); 3721} 3722 3723static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3724 struct inode *inode, 3725 struct ext4_map_blocks *map, 3726 struct ext4_ext_path **ppath) 3727{ 3728 struct ext4_ext_path *path = *ppath; 3729 struct ext4_extent *ex; 3730 ext4_lblk_t ee_block; 3731 unsigned int ee_len; 3732 int depth; 3733 int err = 0; 3734 3735 depth = ext_depth(inode); 3736 ex = path[depth].p_ext; 3737 ee_block = le32_to_cpu(ex->ee_block); 3738 ee_len = ext4_ext_get_actual_len(ex); 3739 3740 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3741 (unsigned long long)ee_block, ee_len); 3742 3743 /* If extent is larger than requested it is a clear sign that we still 3744 * have some extent state machine issues left. So extent_split is still 3745 * required. 3746 * TODO: Once all related issues will be fixed this situation should be 3747 * illegal. 
3748 */ 3749 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3750#ifdef CONFIG_EXT4_DEBUG 3751 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," 3752 " len %u; IO logical block %llu, len %u", 3753 inode->i_ino, (unsigned long long)ee_block, ee_len, 3754 (unsigned long long)map->m_lblk, map->m_len); 3755#endif 3756 err = ext4_split_convert_extents(handle, inode, map, ppath, 3757 EXT4_GET_BLOCKS_CONVERT); 3758 if (err < 0) 3759 return err; 3760 path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3761 if (IS_ERR(path)) 3762 return PTR_ERR(path); 3763 depth = ext_depth(inode); 3764 ex = path[depth].p_ext; 3765 } 3766 3767 err = ext4_ext_get_access(handle, inode, path + depth); 3768 if (err) 3769 goto out; 3770 /* first mark the extent as initialized */ 3771 ext4_ext_mark_initialized(ex); 3772 3773 /* note: ext4_ext_correct_indexes() isn't needed here because 3774 * borders are not changed 3775 */ 3776 ext4_ext_try_to_merge(handle, inode, path, ex); 3777 3778 /* Mark modified extent as dirty */ 3779 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3780out: 3781 ext4_ext_show_leaf(inode, path); 3782 return err; 3783} 3784 3785static int 3786convert_initialized_extent(handle_t *handle, struct inode *inode, 3787 struct ext4_map_blocks *map, 3788 struct ext4_ext_path **ppath, 3789 unsigned int *allocated) 3790{ 3791 struct ext4_ext_path *path = *ppath; 3792 struct ext4_extent *ex; 3793 ext4_lblk_t ee_block; 3794 unsigned int ee_len; 3795 int depth; 3796 int err = 0; 3797 3798 /* 3799 * Make sure that the extent is no bigger than we support with 3800 * unwritten extent 3801 */ 3802 if (map->m_len > EXT_UNWRITTEN_MAX_LEN) 3803 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; 3804 3805 depth = ext_depth(inode); 3806 ex = path[depth].p_ext; 3807 ee_block = le32_to_cpu(ex->ee_block); 3808 ee_len = ext4_ext_get_actual_len(ex); 3809 3810 ext_debug(inode, "logical block %llu, max_blocks %u\n", 3811 (unsigned long long)ee_block, ee_len); 3812 3813 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3814 err = ext4_split_convert_extents(handle, inode, map, ppath, 3815 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3816 if (err < 0) 3817 return err; 3818 path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3819 if (IS_ERR(path)) 3820 return PTR_ERR(path); 3821 depth = ext_depth(inode); 3822 ex = path[depth].p_ext; 3823 if (!ex) { 3824 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 3825 (unsigned long) map->m_lblk); 3826 return -EFSCORRUPTED; 3827 } 3828 } 3829 3830 err = ext4_ext_get_access(handle, inode, path + depth); 3831 if (err) 3832 return err; 3833 /* first mark the extent as unwritten */ 3834 ext4_ext_mark_unwritten(ex); 3835 3836 /* note: ext4_ext_correct_indexes() isn't needed here because 3837 * borders are not changed 3838 */ 3839 ext4_ext_try_to_merge(handle, inode, path, ex); 3840 3841 /* Mark modified extent as dirty */ 3842 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3843 if (err) 3844 return err; 3845 ext4_ext_show_leaf(inode, path); 3846 3847 ext4_update_inode_fsync_trans(handle, inode, 1); 3848 3849 map->m_flags |= EXT4_MAP_UNWRITTEN; 3850 if (*allocated > map->m_len) 3851 *allocated = map->m_len; 3852 map->m_len = *allocated; 3853 return 0; 3854} 3855 3856static int 3857ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, 3858 struct ext4_map_blocks *map, 3859 struct ext4_ext_path **ppath, int flags, 3860 unsigned int allocated, ext4_fsblk_t newblock) 3861{ 3862 struct ext4_ext_path __maybe_unused *path = *ppath; 3863 int 
ret = 0; 3864 int err = 0; 3865 3866 ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", 3867 (unsigned long long)map->m_lblk, map->m_len, flags, 3868 allocated); 3869 ext4_ext_show_leaf(inode, path); 3870 3871 /* 3872 * When writing into unwritten space, we should not fail to 3873 * allocate metadata blocks for the new extent block if needed. 3874 */ 3875 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; 3876 3877 trace_ext4_ext_handle_unwritten_extents(inode, map, flags, 3878 allocated, newblock); 3879 3880 /* get_block() before submitting IO, split the extent */ 3881 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 3882 ret = ext4_split_convert_extents(handle, inode, map, ppath, 3883 flags | EXT4_GET_BLOCKS_CONVERT); 3884 if (ret < 0) { 3885 err = ret; 3886 goto out2; 3887 } 3888 /* 3889 * shouldn't get a 0 return when splitting an extent unless 3890 * m_len is 0 (bug) or extent has been corrupted 3891 */ 3892 if (unlikely(ret == 0)) { 3893 EXT4_ERROR_INODE(inode, 3894 "unexpected ret == 0, m_len = %u", 3895 map->m_len); 3896 err = -EFSCORRUPTED; 3897 goto out2; 3898 } 3899 map->m_flags |= EXT4_MAP_UNWRITTEN; 3900 goto out; 3901 } 3902 /* IO end_io complete, convert the filled extent to written */ 3903 if (flags & EXT4_GET_BLOCKS_CONVERT) { 3904 err = ext4_convert_unwritten_extents_endio(handle, inode, map, 3905 ppath); 3906 if (err < 0) 3907 goto out2; 3908 ext4_update_inode_fsync_trans(handle, inode, 1); 3909 goto map_out; 3910 } 3911 /* buffered IO cases */ 3912 /* 3913 * repeat fallocate creation request 3914 * we already have an unwritten extent 3915 */ 3916 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { 3917 map->m_flags |= EXT4_MAP_UNWRITTEN; 3918 goto map_out; 3919 } 3920 3921 /* buffered READ or buffered write_begin() lookup */ 3922 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3923 /* 3924 * We have blocks reserved already. We 3925 * return allocated blocks so that delalloc 3926 * won't do block reservation for us. But 3927 * the buffer head will be unmapped so that 3928 * a read from the block returns 0s. 3929 */ 3930 map->m_flags |= EXT4_MAP_UNWRITTEN; 3931 goto out1; 3932 } 3933 3934 /* 3935 * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. 3936 * For buffered writes, at writepage time, etc. Convert a 3937 * discovered unwritten extent to written. 3938 */ 3939 ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); 3940 if (ret < 0) { 3941 err = ret; 3942 goto out2; 3943 } 3944 ext4_update_inode_fsync_trans(handle, inode, 1); 3945 /* 3946 * shouldn't get a 0 return when converting an unwritten extent 3947 * unless m_len is 0 (bug) or extent has been corrupted 3948 */ 3949 if (unlikely(ret == 0)) { 3950 EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", 3951 map->m_len); 3952 err = -EFSCORRUPTED; 3953 goto out2; 3954 } 3955 3956out: 3957 allocated = ret; 3958 map->m_flags |= EXT4_MAP_NEW; 3959map_out: 3960 map->m_flags |= EXT4_MAP_MAPPED; 3961out1: 3962 map->m_pblk = newblock; 3963 if (allocated > map->m_len) 3964 allocated = map->m_len; 3965 map->m_len = allocated; 3966 ext4_ext_show_leaf(inode, path); 3967out2: 3968 return err ? err : allocated; 3969} 3970 3971/* 3972 * get_implied_cluster_alloc - check to see if the requested 3973 * allocation (in the map structure) overlaps with a cluster already 3974 * allocated in an extent. 
3975 * @sb The filesystem superblock structure 3976 * @map The requested lblk->pblk mapping 3977 * @ex The extent structure which might contain an implied 3978 * cluster allocation 3979 * 3980 * This function is called by ext4_ext_map_blocks() after we failed to 3981 * find blocks that were already in the inode's extent tree. Hence, 3982 * we know that the beginning of the requested region cannot overlap 3983 * the extent from the inode's extent tree. There are three cases we 3984 * want to catch. The first is this case: 3985 * 3986 * |--- cluster # N--| 3987 * |--- extent ---| |---- requested region ---| 3988 * |==========| 3989 * 3990 * The second case that we need to test for is this one: 3991 * 3992 * |--------- cluster # N ----------------| 3993 * |--- requested region --| |------- extent ----| 3994 * |=======================| 3995 * 3996 * The third case is when the requested region lies between two extents 3997 * within the same cluster: 3998 * |------------- cluster # N-------------| 3999 * |----- ex -----| |---- ex_right ----| 4000 * |------ requested region ------| 4001 * |================| 4002 * 4003 * In each of the above cases, we need to set the map->m_pblk and 4004 * map->m_len so it corresponds to the return the extent labelled as 4005 * "|====|" from cluster #N, since it is already in use for data in 4006 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to 4007 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated 4008 * as a new "allocated" block region. Otherwise, we will return 0 and 4009 * ext4_ext_map_blocks() will then allocate one or more new clusters 4010 * by calling ext4_mb_new_blocks(). 4011 */ 4012static int get_implied_cluster_alloc(struct super_block *sb, 4013 struct ext4_map_blocks *map, 4014 struct ext4_extent *ex, 4015 struct ext4_ext_path *path) 4016{ 4017 struct ext4_sb_info *sbi = EXT4_SB(sb); 4018 ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4019 ext4_lblk_t ex_cluster_start, ex_cluster_end; 4020 ext4_lblk_t rr_cluster_start; 4021 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4022 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4023 unsigned short ee_len = ext4_ext_get_actual_len(ex); 4024 4025 /* The extent passed in that we are trying to match */ 4026 ex_cluster_start = EXT4_B2C(sbi, ee_block); 4027 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); 4028 4029 /* The requested region passed into ext4_map_blocks() */ 4030 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 4031 4032 if ((rr_cluster_start == ex_cluster_end) || 4033 (rr_cluster_start == ex_cluster_start)) { 4034 if (rr_cluster_start == ex_cluster_end) 4035 ee_start += ee_len - 1; 4036 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; 4037 map->m_len = min(map->m_len, 4038 (unsigned) sbi->s_cluster_ratio - c_offset); 4039 /* 4040 * Check for and handle this case: 4041 * 4042 * |--------- cluster # N-------------| 4043 * |------- extent ----| 4044 * |--- requested region ---| 4045 * |===========| 4046 */ 4047 4048 if (map->m_lblk < ee_block) 4049 map->m_len = min(map->m_len, ee_block - map->m_lblk); 4050 4051 /* 4052 * Check for the case where there is already another allocated 4053 * block to the right of 'ex' but before the end of the cluster. 
4054 * 4055 * |------------- cluster # N-------------| 4056 * |----- ex -----| |---- ex_right ----| 4057 * |------ requested region ------| 4058 * |================| 4059 */ 4060 if (map->m_lblk > ee_block) { 4061 ext4_lblk_t next = ext4_ext_next_allocated_block(path); 4062 map->m_len = min(map->m_len, next - map->m_lblk); 4063 } 4064 4065 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); 4066 return 1; 4067 } 4068 4069 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); 4070 return 0; 4071} 4072 4073 4074/* 4075 * Block allocation/map/preallocation routine for extents based files 4076 * 4077 * 4078 * Need to be called with 4079 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 4080 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 4081 * 4082 * return > 0, number of blocks already mapped/allocated 4083 * if create == 0 and these are pre-allocated blocks 4084 * buffer head is unmapped 4085 * otherwise blocks are mapped 4086 * 4087 * return = 0, if plain look up failed (blocks have not been allocated) 4088 * buffer head is unmapped 4089 * 4090 * return < 0, error case. 4091 */ 4092int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 4093 struct ext4_map_blocks *map, int flags) 4094{ 4095 struct ext4_ext_path *path = NULL; 4096 struct ext4_extent newex, *ex, ex2; 4097 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4098 ext4_fsblk_t newblock = 0, pblk; 4099 int err = 0, depth, ret; 4100 unsigned int allocated = 0, offset = 0; 4101 unsigned int allocated_clusters = 0; 4102 struct ext4_allocation_request ar; 4103 ext4_lblk_t cluster_offset; 4104 4105 ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); 4106 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4107 4108 /* find extent for this block */ 4109 path = ext4_find_extent(inode, map->m_lblk, NULL, 0); 4110 if (IS_ERR(path)) { 4111 err = PTR_ERR(path); 4112 path = NULL; 4113 goto out; 4114 } 4115 4116 depth = ext_depth(inode); 4117 4118 /* 4119 * consistent leaf must not be empty; 4120 * this situation is possible, though, _during_ tree modification; 4121 * this is why assert can't be put in ext4_find_extent() 4122 */ 4123 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 4124 EXT4_ERROR_INODE(inode, "bad extent address " 4125 "lblock: %lu, depth: %d pblock %lld", 4126 (unsigned long) map->m_lblk, depth, 4127 path[depth].p_block); 4128 err = -EFSCORRUPTED; 4129 goto out; 4130 } 4131 4132 ex = path[depth].p_ext; 4133 if (ex) { 4134 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4135 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4136 unsigned short ee_len; 4137 4138 4139 /* 4140 * unwritten extents are treated as holes, except that 4141 * we split out initialized portions during a write. 4142 */ 4143 ee_len = ext4_ext_get_actual_len(ex); 4144 4145 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); 4146 4147 /* if found extent covers block, simply return it */ 4148 if (in_range(map->m_lblk, ee_block, ee_len)) { 4149 newblock = map->m_lblk - ee_block + ee_start; 4150 /* number of remaining blocks in the extent */ 4151 allocated = ee_len - (map->m_lblk - ee_block); 4152 ext_debug(inode, "%u fit into %u:%d -> %llu\n", 4153 map->m_lblk, ee_block, ee_len, newblock); 4154 4155 /* 4156 * If the extent is initialized check whether the 4157 * caller wants to convert it to unwritten. 
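			 * (Initialized -> unwritten conversion is requested via
			 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, e.g. by ext4_zero_range()
			 * for the block-aligned interior of the range.)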
4158 */ 4159 if ((!ext4_ext_is_unwritten(ex)) && 4160 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4161 err = convert_initialized_extent(handle, 4162 inode, map, &path, &allocated); 4163 goto out; 4164 } else if (!ext4_ext_is_unwritten(ex)) { 4165 map->m_flags |= EXT4_MAP_MAPPED; 4166 map->m_pblk = newblock; 4167 if (allocated > map->m_len) 4168 allocated = map->m_len; 4169 map->m_len = allocated; 4170 ext4_ext_show_leaf(inode, path); 4171 goto out; 4172 } 4173 4174 ret = ext4_ext_handle_unwritten_extents( 4175 handle, inode, map, &path, flags, 4176 allocated, newblock); 4177 if (ret < 0) 4178 err = ret; 4179 else 4180 allocated = ret; 4181 goto out; 4182 } 4183 } 4184 4185 /* 4186 * requested block isn't allocated yet; 4187 * we couldn't try to create block if create flag is zero 4188 */ 4189 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4190 ext4_lblk_t hole_start, hole_len; 4191 4192 hole_start = map->m_lblk; 4193 hole_len = ext4_ext_determine_hole(inode, path, &hole_start); 4194 /* 4195 * put just found gap into cache to speed up 4196 * subsequent requests 4197 */ 4198 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); 4199 4200 /* Update hole_len to reflect hole size after map->m_lblk */ 4201 if (hole_start != map->m_lblk) 4202 hole_len -= map->m_lblk - hole_start; 4203 map->m_pblk = 0; 4204 map->m_len = min_t(unsigned int, map->m_len, hole_len); 4205 4206 goto out; 4207 } 4208 4209 /* 4210 * Okay, we need to do block allocation. 4211 */ 4212 newex.ee_block = cpu_to_le32(map->m_lblk); 4213 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4214 4215 /* 4216 * If we are doing bigalloc, check to see if the extent returned 4217 * by ext4_find_extent() implies a cluster we can use. 4218 */ 4219 if (cluster_offset && ex && 4220 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4221 ar.len = allocated = map->m_len; 4222 newblock = map->m_pblk; 4223 goto got_allocated_blocks; 4224 } 4225 4226 /* find neighbour allocated blocks */ 4227 ar.lleft = map->m_lblk; 4228 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 4229 if (err) 4230 goto out; 4231 ar.lright = map->m_lblk; 4232 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); 4233 if (err < 0) 4234 goto out; 4235 4236 /* Check if the extent after searching to the right implies a 4237 * cluster we can use. */ 4238 if ((sbi->s_cluster_ratio > 1) && err && 4239 get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { 4240 ar.len = allocated = map->m_len; 4241 newblock = map->m_pblk; 4242 goto got_allocated_blocks; 4243 } 4244 4245 /* 4246 * See if request is beyond maximum number of blocks we can have in 4247 * a single extent. For an initialized extent this limit is 4248 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is 4249 * EXT_UNWRITTEN_MAX_LEN. 
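	 * (EXT_INIT_MAX_LEN is 1 << 15 blocks; EXT_UNWRITTEN_MAX_LEN is one
	 * block less, since the high bit of ee_len is used to mark an extent
	 * as unwritten.)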
4250 */ 4251 if (map->m_len > EXT_INIT_MAX_LEN && 4252 !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) 4253 map->m_len = EXT_INIT_MAX_LEN; 4254 else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && 4255 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) 4256 map->m_len = EXT_UNWRITTEN_MAX_LEN; 4257 4258 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 4259 newex.ee_len = cpu_to_le16(map->m_len); 4260 err = ext4_ext_check_overlap(sbi, inode, &newex, path); 4261 if (err) 4262 allocated = ext4_ext_get_actual_len(&newex); 4263 else 4264 allocated = map->m_len; 4265 4266 /* allocate new block */ 4267 ar.inode = inode; 4268 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 4269 ar.logical = map->m_lblk; 4270 /* 4271 * We calculate the offset from the beginning of the cluster 4272 * for the logical block number, since when we allocate a 4273 * physical cluster, the physical block should start at the 4274 * same offset from the beginning of the cluster. This is 4275 * needed so that future calls to get_implied_cluster_alloc() 4276 * work correctly. 4277 */ 4278 offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4279 ar.len = EXT4_NUM_B2C(sbi, offset+allocated); 4280 ar.goal -= offset; 4281 ar.logical -= offset; 4282 if (S_ISREG(inode->i_mode)) 4283 ar.flags = EXT4_MB_HINT_DATA; 4284 else 4285 /* disable in-core preallocation for non-regular files */ 4286 ar.flags = 0; 4287 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 4288 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 4289 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4290 ar.flags |= EXT4_MB_DELALLOC_RESERVED; 4291 if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 4292 ar.flags |= EXT4_MB_USE_RESERVED; 4293 newblock = ext4_mb_new_blocks(handle, &ar, &err); 4294 if (!newblock) 4295 goto out; 4296 allocated_clusters = ar.len; 4297 ar.len = EXT4_C2B(sbi, ar.len) - offset; 4298 ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", 4299 ar.goal, newblock, ar.len, allocated); 4300 if (ar.len > allocated) 4301 ar.len = allocated; 4302 4303got_allocated_blocks: 4304 /* try to insert new extent into found leaf and return */ 4305 pblk = newblock + offset; 4306 ext4_ext_store_pblock(&newex, pblk); 4307 newex.ee_len = cpu_to_le16(ar.len); 4308 /* Mark unwritten */ 4309 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { 4310 ext4_ext_mark_unwritten(&newex); 4311 map->m_flags |= EXT4_MAP_UNWRITTEN; 4312 } 4313 4314 err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); 4315 if (err) { 4316 if (allocated_clusters) { 4317 int fb_flags = 0; 4318 4319 /* 4320 * free data blocks we just allocated. 4321 * not a good idea to call discard here directly, 4322 * but otherwise we'd need to call it every free(). 4323 */ 4324 ext4_discard_preallocations(inode, 0); 4325 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4326 fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; 4327 ext4_free_blocks(handle, inode, NULL, newblock, 4328 EXT4_C2B(sbi, allocated_clusters), 4329 fb_flags); 4330 } 4331 goto out; 4332 } 4333 4334 /* 4335 * Reduce the reserved cluster count to reflect successful deferred 4336 * allocation of delayed allocated clusters or direct allocation of 4337 * clusters discovered to be delayed allocated. Once allocated, a 4338 * cluster is not included in the reserved count. 
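	 * (Illustrative: if four previously delayed-allocated clusters are
	 * allocated here, the inode's reserved cluster count drops by four
	 * via ext4_da_update_reserve_space() below.)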
4339 */ 4340 if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { 4341 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 4342 /* 4343 * When allocating delayed allocated clusters, simply 4344 * reduce the reserved cluster count and claim quota 4345 */ 4346 ext4_da_update_reserve_space(inode, allocated_clusters, 4347 1); 4348 } else { 4349 ext4_lblk_t lblk, len; 4350 unsigned int n; 4351 4352 /* 4353 * When allocating non-delayed allocated clusters 4354 * (from fallocate, filemap, DIO, or clusters 4355 * allocated when delalloc has been disabled by 4356 * ext4_nonda_switch), reduce the reserved cluster 4357 * count by the number of allocated clusters that 4358 * have previously been delayed allocated. Quota 4359 * has been claimed by ext4_mb_new_blocks() above, 4360 * so release the quota reservations made for any 4361 * previously delayed allocated clusters. 4362 */ 4363 lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); 4364 len = allocated_clusters << sbi->s_cluster_bits; 4365 n = ext4_es_delayed_clu(inode, lblk, len); 4366 if (n > 0) 4367 ext4_da_update_reserve_space(inode, (int) n, 0); 4368 } 4369 } 4370 4371 /* 4372 * Cache the extent and update transaction to commit on fdatasync only 4373 * when it is _not_ an unwritten extent. 4374 */ 4375 if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) 4376 ext4_update_inode_fsync_trans(handle, inode, 1); 4377 else 4378 ext4_update_inode_fsync_trans(handle, inode, 0); 4379 4380 map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); 4381 map->m_pblk = pblk; 4382 map->m_len = ar.len; 4383 allocated = map->m_len; 4384 ext4_ext_show_leaf(inode, path); 4385out: 4386 ext4_ext_drop_refs(path); 4387 kfree(path); 4388 4389 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4390 err ? err : allocated); 4391 return err ? err : allocated; 4392} 4393 4394int ext4_ext_truncate(handle_t *handle, struct inode *inode) 4395{ 4396 struct super_block *sb = inode->i_sb; 4397 ext4_lblk_t last_block; 4398 int err = 0; 4399 4400 /* 4401 * TODO: optimization is possible here. 4402 * Probably we need not scan at all, 4403 * because page truncation is enough. 4404 */ 4405 4406 /* we have to know where to truncate from in crash case */ 4407 EXT4_I(inode)->i_disksize = inode->i_size; 4408 err = ext4_mark_inode_dirty(handle, inode); 4409 if (err) 4410 return err; 4411 4412 last_block = (inode->i_size + sb->s_blocksize - 1) 4413 >> EXT4_BLOCK_SIZE_BITS(sb); 4414retry: 4415 err = ext4_es_remove_extent(inode, last_block, 4416 EXT_MAX_BLOCKS - last_block); 4417 if (err == -ENOMEM) { 4418 cond_resched(); 4419 congestion_wait(BLK_RW_ASYNC, HZ/50); 4420 goto retry; 4421 } 4422 if (err) 4423 return err; 4424retry_remove_space: 4425 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4426 if (err == -ENOMEM) { 4427 cond_resched(); 4428 congestion_wait(BLK_RW_ASYNC, HZ/50); 4429 goto retry_remove_space; 4430 } 4431 return err; 4432} 4433 4434static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4435 ext4_lblk_t len, loff_t new_size, 4436 int flags) 4437{ 4438 struct inode *inode = file_inode(file); 4439 handle_t *handle; 4440 int ret = 0; 4441 int ret2 = 0, ret3 = 0; 4442 int retries = 0; 4443 int depth = 0; 4444 struct ext4_map_blocks map; 4445 unsigned int credits; 4446 loff_t epos; 4447 4448 BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); 4449 map.m_lblk = offset; 4450 map.m_len = len; 4451 /* 4452 * Don't normalize the request if it can fit in one extent so 4453 * that it doesn't get unnecessarily split into multiple 4454 * extents. 
4455 */ 4456 if (len <= EXT_UNWRITTEN_MAX_LEN) 4457 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4458 4459 /* 4460 * credits to insert 1 extent into extent tree 4461 */ 4462 credits = ext4_chunk_trans_blocks(inode, len); 4463 depth = ext_depth(inode); 4464 4465retry: 4466 while (ret >= 0 && len) { 4467 /* 4468 * Recalculate credits when extent tree depth changes. 4469 */ 4470 if (depth != ext_depth(inode)) { 4471 credits = ext4_chunk_trans_blocks(inode, len); 4472 depth = ext_depth(inode); 4473 } 4474 4475 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4476 credits); 4477 if (IS_ERR(handle)) { 4478 ret = PTR_ERR(handle); 4479 break; 4480 } 4481 ret = ext4_map_blocks(handle, inode, &map, flags); 4482 if (ret <= 0) { 4483 ext4_debug("inode #%lu: block %u: len %u: " 4484 "ext4_ext_map_blocks returned %d", 4485 inode->i_ino, map.m_lblk, 4486 map.m_len, ret); 4487 ext4_mark_inode_dirty(handle, inode); 4488 ret2 = ext4_journal_stop(handle); 4489 break; 4490 } 4491 map.m_lblk += ret; 4492 map.m_len = len = len - ret; 4493 epos = (loff_t)map.m_lblk << inode->i_blkbits; 4494 inode->i_ctime = current_time(inode); 4495 if (new_size) { 4496 if (epos > new_size) 4497 epos = new_size; 4498 if (ext4_update_inode_size(inode, epos) & 0x1) 4499 inode->i_mtime = inode->i_ctime; 4500 } 4501 ret2 = ext4_mark_inode_dirty(handle, inode); 4502 ext4_update_inode_fsync_trans(handle, inode, 1); 4503 ret3 = ext4_journal_stop(handle); 4504 ret2 = ret3 ? ret3 : ret2; 4505 if (unlikely(ret2)) 4506 break; 4507 } 4508 if (ret == -ENOSPC && 4509 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4510 ret = 0; 4511 goto retry; 4512 } 4513 4514 return ret > 0 ? ret2 : ret; 4515} 4516 4517static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); 4518 4519static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); 4520 4521static long ext4_zero_range(struct file *file, loff_t offset, 4522 loff_t len, int mode) 4523{ 4524 struct inode *inode = file_inode(file); 4525 handle_t *handle = NULL; 4526 unsigned int max_blocks; 4527 loff_t new_size = 0; 4528 int ret = 0; 4529 int flags; 4530 int credits; 4531 int partial_begin, partial_end; 4532 loff_t start, end; 4533 ext4_lblk_t lblk; 4534 unsigned int blkbits = inode->i_blkbits; 4535 4536 trace_ext4_zero_range(inode, offset, len, mode); 4537 4538 /* Call ext4_force_commit to flush all data in case of data=journal. */ 4539 if (ext4_should_journal_data(inode)) { 4540 ret = ext4_force_commit(inode->i_sb); 4541 if (ret) 4542 return ret; 4543 } 4544 4545 /* 4546 * Round up offset. This is not fallocate, we need to zero out 4547 * blocks, so convert interior block aligned part of the range to 4548 * unwritten and possibly manually zero out unaligned parts of the 4549 * range. 
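	 * (Illustrative, 4k blocks: zeroing offset 1000, length 10000
	 * converts only the fully covered range [4096, 8192) to unwritten;
	 * the partial head [1000, 4096) and tail [8192, 11000) are zeroed
	 * directly via ext4_zero_partial_blocks() below.)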
4550 */ 4551 start = round_up(offset, 1 << blkbits); 4552 end = round_down((offset + len), 1 << blkbits); 4553 4554 if (start < offset || end > offset + len) 4555 return -EINVAL; 4556 partial_begin = offset & ((1 << blkbits) - 1); 4557 partial_end = (offset + len) & ((1 << blkbits) - 1); 4558 4559 lblk = start >> blkbits; 4560 max_blocks = (end >> blkbits); 4561 if (max_blocks < lblk) 4562 max_blocks = 0; 4563 else 4564 max_blocks -= lblk; 4565 4566 inode_lock(inode); 4567 4568 /* 4569 * Indirect files do not support unwritten extents 4570 */ 4571 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4572 ret = -EOPNOTSUPP; 4573 goto out_mutex; 4574 } 4575 4576 if (!(mode & FALLOC_FL_KEEP_SIZE) && 4577 (offset + len > inode->i_size || 4578 offset + len > EXT4_I(inode)->i_disksize)) { 4579 new_size = offset + len; 4580 ret = inode_newsize_ok(inode, new_size); 4581 if (ret) 4582 goto out_mutex; 4583 } 4584 4585 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4586 4587 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4588 inode_dio_wait(inode); 4589 4590 ret = file_modified(file); 4591 if (ret) 4592 goto out_mutex; 4593 4594 /* Preallocate the range including the unaligned edges */ 4595 if (partial_begin || partial_end) { 4596 ret = ext4_alloc_file_blocks(file, 4597 round_down(offset, 1 << blkbits) >> blkbits, 4598 (round_up((offset + len), 1 << blkbits) - 4599 round_down(offset, 1 << blkbits)) >> blkbits, 4600 new_size, flags); 4601 if (ret) 4602 goto out_mutex; 4603 4604 } 4605 4606 /* Zero range excluding the unaligned edges */ 4607 if (max_blocks > 0) { 4608 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4609 EXT4_EX_NOCACHE); 4610 4611 /* 4612 * Prevent page faults from reinstantiating pages we have 4613 * released from page cache. 
4614 */ 4615 down_write(&EXT4_I(inode)->i_mmap_sem); 4616 4617 ret = ext4_break_layouts(inode); 4618 if (ret) { 4619 up_write(&EXT4_I(inode)->i_mmap_sem); 4620 goto out_mutex; 4621 } 4622 4623 ret = ext4_update_disksize_before_punch(inode, offset, len); 4624 if (ret) { 4625 up_write(&EXT4_I(inode)->i_mmap_sem); 4626 goto out_mutex; 4627 } 4628 /* Now release the pages and zero block aligned part of pages */ 4629 truncate_pagecache_range(inode, start, end - 1); 4630 inode->i_mtime = inode->i_ctime = current_time(inode); 4631 4632 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4633 flags); 4634 up_write(&EXT4_I(inode)->i_mmap_sem); 4635 if (ret) 4636 goto out_mutex; 4637 } 4638 if (!partial_begin && !partial_end) 4639 goto out_mutex; 4640 4641 /* 4642 * In worst case we have to writeout two nonadjacent unwritten 4643 * blocks and update the inode 4644 */ 4645 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; 4646 if (ext4_should_journal_data(inode)) 4647 credits += 2; 4648 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); 4649 if (IS_ERR(handle)) { 4650 ret = PTR_ERR(handle); 4651 ext4_std_error(inode->i_sb, ret); 4652 goto out_mutex; 4653 } 4654 4655 inode->i_mtime = inode->i_ctime = current_time(inode); 4656 if (new_size) 4657 ext4_update_inode_size(inode, new_size); 4658 ret = ext4_mark_inode_dirty(handle, inode); 4659 if (unlikely(ret)) 4660 goto out_handle; 4661 /* Zero out partial block at the edges of the range */ 4662 ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4663 if (ret >= 0) 4664 ext4_update_inode_fsync_trans(handle, inode, 1); 4665 4666 if (file->f_flags & O_SYNC) 4667 ext4_handle_sync(handle); 4668 4669out_handle: 4670 ext4_journal_stop(handle); 4671out_mutex: 4672 inode_unlock(inode); 4673 return ret; 4674} 4675 4676/* 4677 * preallocate space for a file. This implements ext4's fallocate file 4678 * operation, which gets called from sys_fallocate system call. 4679 * For block-mapped files, posix_fallocate should fall back to the method 4680 * of writing zeroes to the required new blocks (the same behavior which is 4681 * expected for file systems which do not support fallocate() system call). 4682 */ 4683long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4684{ 4685 struct inode *inode = file_inode(file); 4686 loff_t new_size = 0; 4687 unsigned int max_blocks; 4688 int ret = 0; 4689 int flags; 4690 ext4_lblk_t lblk; 4691 unsigned int blkbits = inode->i_blkbits; 4692 4693 /* 4694 * Encrypted inodes can't handle collapse range or insert 4695 * range since we would need to re-encrypt blocks with a 4696 * different IV or XTS tweak (which are based on the logical 4697 * block number). 
4698 */ 4699 if (IS_ENCRYPTED(inode) && 4700 (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) 4701 return -EOPNOTSUPP; 4702 4703 /* Return error if mode is not supported */ 4704 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 4705 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | 4706 FALLOC_FL_INSERT_RANGE)) 4707 return -EOPNOTSUPP; 4708 4709 inode_lock(inode); 4710 ret = ext4_convert_inline_data(inode); 4711 inode_unlock(inode); 4712 if (ret) 4713 goto exit; 4714 4715 if (mode & FALLOC_FL_PUNCH_HOLE) { 4716 ret = ext4_punch_hole(file, offset, len); 4717 goto exit; 4718 } 4719 4720 if (mode & FALLOC_FL_COLLAPSE_RANGE) { 4721 ret = ext4_collapse_range(file, offset, len); 4722 goto exit; 4723 } 4724 4725 if (mode & FALLOC_FL_INSERT_RANGE) { 4726 ret = ext4_insert_range(file, offset, len); 4727 goto exit; 4728 } 4729 4730 if (mode & FALLOC_FL_ZERO_RANGE) { 4731 ret = ext4_zero_range(file, offset, len, mode); 4732 goto exit; 4733 } 4734 trace_ext4_fallocate_enter(inode, offset, len, mode); 4735 lblk = offset >> blkbits; 4736 4737 max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); 4738 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4739 4740 inode_lock(inode); 4741 4742 /* 4743 * We only support preallocation for extent-based files only 4744 */ 4745 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4746 ret = -EOPNOTSUPP; 4747 goto out; 4748 } 4749 4750 if (!(mode & FALLOC_FL_KEEP_SIZE) && 4751 (offset + len > inode->i_size || 4752 offset + len > EXT4_I(inode)->i_disksize)) { 4753 new_size = offset + len; 4754 ret = inode_newsize_ok(inode, new_size); 4755 if (ret) 4756 goto out; 4757 } 4758 4759 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4760 inode_dio_wait(inode); 4761 4762 ret = file_modified(file); 4763 if (ret) 4764 goto out; 4765 4766 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); 4767 if (ret) 4768 goto out; 4769 4770 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4771 ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 4772 EXT4_I(inode)->i_sync_tid); 4773 } 4774out: 4775 inode_unlock(inode); 4776 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4777exit: 4778 return ret; 4779} 4780 4781/* 4782 * This function convert a range of blocks to written extents 4783 * The caller of this function will pass the start offset and the size. 4784 * all unwritten extents within this range will be converted to 4785 * written extents. 4786 * 4787 * This function is called from the direct IO end io call back 4788 * function, to convert the fallocated extents after IO is completed. 4789 * Returns 0 on success. 
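 *
 * See also ext4_convert_unwritten_io_end_vec() below, which invokes this
 * helper once per io_end_vec.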
4790 */ 4791int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, 4792 loff_t offset, ssize_t len) 4793{ 4794 unsigned int max_blocks; 4795 int ret = 0, ret2 = 0, ret3 = 0; 4796 struct ext4_map_blocks map; 4797 unsigned int blkbits = inode->i_blkbits; 4798 unsigned int credits = 0; 4799 4800 map.m_lblk = offset >> blkbits; 4801 max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); 4802 4803 if (!handle) { 4804 /* 4805 * credits to insert 1 extent into extent tree 4806 */ 4807 credits = ext4_chunk_trans_blocks(inode, max_blocks); 4808 } 4809 while (ret >= 0 && ret < max_blocks) { 4810 map.m_lblk += ret; 4811 map.m_len = (max_blocks -= ret); 4812 if (credits) { 4813 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4814 credits); 4815 if (IS_ERR(handle)) { 4816 ret = PTR_ERR(handle); 4817 break; 4818 } 4819 } 4820 ret = ext4_map_blocks(handle, inode, &map, 4821 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 4822 if (ret <= 0) 4823 ext4_warning(inode->i_sb, 4824 "inode #%lu: block %u: len %u: " 4825 "ext4_ext_map_blocks returned %d", 4826 inode->i_ino, map.m_lblk, 4827 map.m_len, ret); 4828 ret2 = ext4_mark_inode_dirty(handle, inode); 4829 if (credits) { 4830 ret3 = ext4_journal_stop(handle); 4831 if (unlikely(ret3)) 4832 ret2 = ret3; 4833 } 4834 4835 if (ret <= 0 || ret2) 4836 break; 4837 } 4838 return ret > 0 ? ret2 : ret; 4839} 4840 4841int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) 4842{ 4843 int ret = 0, err = 0; 4844 struct ext4_io_end_vec *io_end_vec; 4845 4846 /* 4847 * This is somewhat ugly but the idea is clear: When transaction is 4848 * reserved, everything goes into it. Otherwise we rather start several 4849 * smaller transactions for conversion of each extent separately. 4850 */ 4851 if (handle) { 4852 handle = ext4_journal_start_reserved(handle, 4853 EXT4_HT_EXT_CONVERT); 4854 if (IS_ERR(handle)) 4855 return PTR_ERR(handle); 4856 } 4857 4858 list_for_each_entry(io_end_vec, &io_end->list_vec, list) { 4859 ret = ext4_convert_unwritten_extents(handle, io_end->inode, 4860 io_end_vec->offset, 4861 io_end_vec->size); 4862 if (ret) 4863 break; 4864 } 4865 4866 if (handle) 4867 err = ext4_journal_stop(handle); 4868 4869 return ret < 0 ? ret : err; 4870} 4871 4872static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) 4873{ 4874 __u64 physical = 0; 4875 __u64 length = 0; 4876 int blockbits = inode->i_sb->s_blocksize_bits; 4877 int error = 0; 4878 u16 iomap_type; 4879 4880 /* in-inode? 
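	   Extended attributes may live in the inode body, right after the
	   fixed fields plus i_extra_isize, or in one external block pointed
	   to by i_file_acl; map whichever location is in use.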
*/ 4881 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 4882 struct ext4_iloc iloc; 4883 int offset; /* offset of xattr in inode */ 4884 4885 error = ext4_get_inode_loc(inode, &iloc); 4886 if (error) 4887 return error; 4888 physical = (__u64)iloc.bh->b_blocknr << blockbits; 4889 offset = EXT4_GOOD_OLD_INODE_SIZE + 4890 EXT4_I(inode)->i_extra_isize; 4891 physical += offset; 4892 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 4893 brelse(iloc.bh); 4894 iomap_type = IOMAP_INLINE; 4895 } else if (EXT4_I(inode)->i_file_acl) { /* external block */ 4896 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; 4897 length = inode->i_sb->s_blocksize; 4898 iomap_type = IOMAP_MAPPED; 4899 } else { 4900 /* no in-inode or external block for xattr, so return -ENOENT */ 4901 error = -ENOENT; 4902 goto out; 4903 } 4904 4905 iomap->addr = physical; 4906 iomap->offset = 0; 4907 iomap->length = length; 4908 iomap->type = iomap_type; 4909 iomap->flags = 0; 4910out: 4911 return error; 4912} 4913 4914static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, 4915 loff_t length, unsigned flags, 4916 struct iomap *iomap, struct iomap *srcmap) 4917{ 4918 int error; 4919 4920 error = ext4_iomap_xattr_fiemap(inode, iomap); 4921 if (error == 0 && (offset >= iomap->length)) 4922 error = -ENOENT; 4923 return error; 4924} 4925 4926static const struct iomap_ops ext4_iomap_xattr_ops = { 4927 .iomap_begin = ext4_iomap_xattr_begin, 4928}; 4929 4930static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) 4931{ 4932 u64 maxbytes; 4933 4934 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4935 maxbytes = inode->i_sb->s_maxbytes; 4936 else 4937 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 4938 4939 if (*len == 0) 4940 return -EINVAL; 4941 if (start > maxbytes) 4942 return -EFBIG; 4943 4944 /* 4945 * Shrink request scope to what the fs can actually handle. 4946 */ 4947 if (*len > maxbytes || (maxbytes - *len) < start) 4948 *len = maxbytes - start; 4949 return 0; 4950} 4951 4952int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4953 u64 start, u64 len) 4954{ 4955 int error = 0; 4956 4957 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 4958 error = ext4_ext_precache(inode); 4959 if (error) 4960 return error; 4961 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; 4962 } 4963 4964 /* 4965 * For bitmap files the maximum size limit could be smaller than 4966 * s_maxbytes, so check len here manually instead of just relying on the 4967 * generic check. 
4968 */ 4969 error = ext4_fiemap_check_ranges(inode, start, &len); 4970 if (error) 4971 return error; 4972 4973 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 4974 fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; 4975 return iomap_fiemap(inode, fieinfo, start, len, 4976 &ext4_iomap_xattr_ops); 4977 } 4978 4979 return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); 4980} 4981 4982int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, 4983 __u64 start, __u64 len) 4984{ 4985 ext4_lblk_t start_blk, len_blks; 4986 __u64 last_blk; 4987 int error = 0; 4988 4989 if (ext4_has_inline_data(inode)) { 4990 int has_inline; 4991 4992 down_read(&EXT4_I(inode)->xattr_sem); 4993 has_inline = ext4_has_inline_data(inode); 4994 up_read(&EXT4_I(inode)->xattr_sem); 4995 if (has_inline) 4996 return 0; 4997 } 4998 4999 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 5000 error = ext4_ext_precache(inode); 5001 if (error) 5002 return error; 5003 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; 5004 } 5005 5006 error = fiemap_prep(inode, fieinfo, start, &len, 0); 5007 if (error) 5008 return error; 5009 5010 error = ext4_fiemap_check_ranges(inode, start, &len); 5011 if (error) 5012 return error; 5013 5014 start_blk = start >> inode->i_sb->s_blocksize_bits; 5015 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 5016 if (last_blk >= EXT_MAX_BLOCKS) 5017 last_blk = EXT_MAX_BLOCKS-1; 5018 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 5019 5020 /* 5021 * Walk the extent tree gathering extent information 5022 * and pushing extents back to the user. 5023 */ 5024 return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); 5025} 5026 5027/* 5028 * ext4_ext_shift_path_extents: 5029 * Shift the extents of a path structure lying between path[depth].p_ext 5030 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells 5031 * if it is right shift or left shift operation. 5032 */ 5033static int 5034ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, 5035 struct inode *inode, handle_t *handle, 5036 enum SHIFT_DIRECTION SHIFT) 5037{ 5038 int depth, err = 0; 5039 struct ext4_extent *ex_start, *ex_last; 5040 bool update = false; 5041 int credits, restart_credits; 5042 depth = path->p_depth; 5043 5044 while (depth >= 0) { 5045 if (depth == path->p_depth) { 5046 ex_start = path[depth].p_ext; 5047 if (!ex_start) 5048 return -EFSCORRUPTED; 5049 5050 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5051 /* leaf + sb + inode */ 5052 credits = 3; 5053 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { 5054 update = true; 5055 /* extent tree + sb + inode */ 5056 credits = depth + 2; 5057 } 5058 5059 restart_credits = ext4_writepage_trans_blocks(inode); 5060 err = ext4_datasem_ensure_credits(handle, inode, credits, 5061 restart_credits, 0); 5062 if (err) { 5063 if (err > 0) 5064 err = -EAGAIN; 5065 goto out; 5066 } 5067 5068 err = ext4_ext_get_access(handle, inode, path + depth); 5069 if (err) 5070 goto out; 5071 5072 while (ex_start <= ex_last) { 5073 if (SHIFT == SHIFT_LEFT) { 5074 le32_add_cpu(&ex_start->ee_block, 5075 -shift); 5076 /* Try to merge to the left. 
*/ 5077 if ((ex_start > 5078 EXT_FIRST_EXTENT(path[depth].p_hdr)) 5079 && 5080 ext4_ext_try_to_merge_right(inode, 5081 path, ex_start - 1)) 5082 ex_last--; 5083 else 5084 ex_start++; 5085 } else { 5086 le32_add_cpu(&ex_last->ee_block, shift); 5087 ext4_ext_try_to_merge_right(inode, path, 5088 ex_last); 5089 ex_last--; 5090 } 5091 } 5092 err = ext4_ext_dirty(handle, inode, path + depth); 5093 if (err) 5094 goto out; 5095 5096 if (--depth < 0 || !update) 5097 break; 5098 } 5099 5100 /* Update index too */ 5101 err = ext4_ext_get_access(handle, inode, path + depth); 5102 if (err) 5103 goto out; 5104 5105 if (SHIFT == SHIFT_LEFT) 5106 le32_add_cpu(&path[depth].p_idx->ei_block, -shift); 5107 else 5108 le32_add_cpu(&path[depth].p_idx->ei_block, shift); 5109 err = ext4_ext_dirty(handle, inode, path + depth); 5110 if (err) 5111 goto out; 5112 5113 /* we are done if current index is not a starting index */ 5114 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) 5115 break; 5116 5117 depth--; 5118 } 5119 5120out: 5121 return err; 5122} 5123 5124/* 5125 * ext4_ext_shift_extents: 5126 * All the extents which lies in the range from @start to the last allocated 5127 * block for the @inode are shifted either towards left or right (depending 5128 * upon @SHIFT) by @shift blocks. 5129 * On success, 0 is returned, error otherwise. 5130 */ 5131static int 5132ext4_ext_shift_extents(struct inode *inode, handle_t *handle, 5133 ext4_lblk_t start, ext4_lblk_t shift, 5134 enum SHIFT_DIRECTION SHIFT) 5135{ 5136 struct ext4_ext_path *path; 5137 int ret = 0, depth; 5138 struct ext4_extent *extent; 5139 ext4_lblk_t stop, *iterator, ex_start, ex_end; 5140 ext4_lblk_t tmp = EXT_MAX_BLOCKS; 5141 5142 /* Let path point to the last extent */ 5143 path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 5144 EXT4_EX_NOCACHE); 5145 if (IS_ERR(path)) 5146 return PTR_ERR(path); 5147 5148 depth = path->p_depth; 5149 extent = path[depth].p_ext; 5150 if (!extent) 5151 goto out; 5152 5153 stop = le32_to_cpu(extent->ee_block); 5154 5155 /* 5156 * For left shifts, make sure the hole on the left is big enough to 5157 * accommodate the shift. For right shifts, make sure the last extent 5158 * won't be shifted beyond EXT_MAX_BLOCKS. 5159 */ 5160 if (SHIFT == SHIFT_LEFT) { 5161 path = ext4_find_extent(inode, start - 1, &path, 5162 EXT4_EX_NOCACHE); 5163 if (IS_ERR(path)) 5164 return PTR_ERR(path); 5165 depth = path->p_depth; 5166 extent = path[depth].p_ext; 5167 if (extent) { 5168 ex_start = le32_to_cpu(extent->ee_block); 5169 ex_end = le32_to_cpu(extent->ee_block) + 5170 ext4_ext_get_actual_len(extent); 5171 } else { 5172 ex_start = 0; 5173 ex_end = 0; 5174 } 5175 5176 if ((start == ex_start && shift > ex_start) || 5177 (shift > start - ex_end)) { 5178 ret = -EINVAL; 5179 goto out; 5180 } 5181 } else { 5182 if (shift > EXT_MAX_BLOCKS - 5183 (stop + ext4_ext_get_actual_len(extent))) { 5184 ret = -EINVAL; 5185 goto out; 5186 } 5187 } 5188 5189 /* 5190 * In case of left shift, iterator points to start and it is increased 5191 * till we reach stop. In case of right shift, iterator points to stop 5192 * and it is decreased till we reach start. 5193 */ 5194again: 5195 ret = 0; 5196 if (SHIFT == SHIFT_LEFT) 5197 iterator = &start; 5198 else 5199 iterator = &stop; 5200 5201 if (tmp != EXT_MAX_BLOCKS) 5202 *iterator = tmp; 5203 5204 /* 5205 * Its safe to start updating extents. Start and stop are unsigned, so 5206 * in case of right shift if extent with 0 block is reached, iterator 5207 * becomes NULL to indicate the end of the loop. 
5208 */ 5209 while (iterator && start <= stop) { 5210 path = ext4_find_extent(inode, *iterator, &path, 5211 EXT4_EX_NOCACHE); 5212 if (IS_ERR(path)) 5213 return PTR_ERR(path); 5214 depth = path->p_depth; 5215 extent = path[depth].p_ext; 5216 if (!extent) { 5217 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 5218 (unsigned long) *iterator); 5219 return -EFSCORRUPTED; 5220 } 5221 if (SHIFT == SHIFT_LEFT && *iterator > 5222 le32_to_cpu(extent->ee_block)) { 5223 /* Hole, move to the next extent */ 5224 if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { 5225 path[depth].p_ext++; 5226 } else { 5227 *iterator = ext4_ext_next_allocated_block(path); 5228 continue; 5229 } 5230 } 5231 5232 tmp = *iterator; 5233 if (SHIFT == SHIFT_LEFT) { 5234 extent = EXT_LAST_EXTENT(path[depth].p_hdr); 5235 *iterator = le32_to_cpu(extent->ee_block) + 5236 ext4_ext_get_actual_len(extent); 5237 } else { 5238 extent = EXT_FIRST_EXTENT(path[depth].p_hdr); 5239 if (le32_to_cpu(extent->ee_block) > start) 5240 *iterator = le32_to_cpu(extent->ee_block) - 1; 5241 else if (le32_to_cpu(extent->ee_block) == start) 5242 iterator = NULL; 5243 else { 5244 extent = EXT_LAST_EXTENT(path[depth].p_hdr); 5245 while (le32_to_cpu(extent->ee_block) >= start) 5246 extent--; 5247 5248 if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) 5249 break; 5250 5251 extent++; 5252 iterator = NULL; 5253 } 5254 path[depth].p_ext = extent; 5255 } 5256 ret = ext4_ext_shift_path_extents(path, shift, inode, 5257 handle, SHIFT); 5258 /* iterator can be NULL which means we should break */ 5259 if (ret == -EAGAIN) 5260 goto again; 5261 if (ret) 5262 break; 5263 } 5264out: 5265 ext4_ext_drop_refs(path); 5266 kfree(path); 5267 return ret; 5268} 5269 5270/* 5271 * ext4_collapse_range: 5272 * This implements the fallocate's collapse range functionality for ext4 5273 * Returns: 0 and non-zero on error. 5274 */ 5275static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) 5276{ 5277 struct inode *inode = file_inode(file); 5278 struct super_block *sb = inode->i_sb; 5279 ext4_lblk_t punch_start, punch_stop; 5280 handle_t *handle; 5281 unsigned int credits; 5282 loff_t new_size, ioffset; 5283 int ret; 5284 5285 /* 5286 * We need to test this early because xfstests assumes that a 5287 * collapse range of (0, 1) will return EOPNOTSUPP if the file 5288 * system does not support collapse range. 5289 */ 5290 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5291 return -EOPNOTSUPP; 5292 5293 /* Collapse range works only on fs cluster size aligned regions. */ 5294 if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) 5295 return -EINVAL; 5296 5297 trace_ext4_collapse_range(inode, offset, len); 5298 5299 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5300 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 5301 5302 /* Call ext4_force_commit to flush all data in case of data=journal. 
*/ 5303 if (ext4_should_journal_data(inode)) { 5304 ret = ext4_force_commit(inode->i_sb); 5305 if (ret) 5306 return ret; 5307 } 5308 5309 inode_lock(inode); 5310 /* 5311 * There is no need to overlap collapse range with EOF, in which case 5312 * it is effectively a truncate operation 5313 */ 5314 if (offset + len >= inode->i_size) { 5315 ret = -EINVAL; 5316 goto out_mutex; 5317 } 5318 5319 /* Currently just for extent based files */ 5320 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5321 ret = -EOPNOTSUPP; 5322 goto out_mutex; 5323 } 5324 5325 /* Wait for existing dio to complete */ 5326 inode_dio_wait(inode); 5327 5328 ret = file_modified(file); 5329 if (ret) 5330 goto out_mutex; 5331 5332 /* 5333 * Prevent page faults from reinstantiating pages we have released from 5334 * page cache. 5335 */ 5336 down_write(&EXT4_I(inode)->i_mmap_sem); 5337 5338 ret = ext4_break_layouts(inode); 5339 if (ret) 5340 goto out_mmap; 5341 5342 /* 5343 * Need to round down offset to be aligned with page size boundary 5344 * for page size > block size. 5345 */ 5346 ioffset = round_down(offset, PAGE_SIZE); 5347 /* 5348 * Write tail of the last page before removed range since it will get 5349 * removed from the page cache below. 5350 */ 5351 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); 5352 if (ret) 5353 goto out_mmap; 5354 /* 5355 * Write data that will be shifted to preserve them when discarding 5356 * page cache below. We are also protected from pages becoming dirty 5357 * by i_mmap_sem. 5358 */ 5359 ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, 5360 LLONG_MAX); 5361 if (ret) 5362 goto out_mmap; 5363 truncate_pagecache(inode, ioffset); 5364 5365 credits = ext4_writepage_trans_blocks(inode); 5366 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5367 if (IS_ERR(handle)) { 5368 ret = PTR_ERR(handle); 5369 goto out_mmap; 5370 } 5371 ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); 5372 5373 down_write(&EXT4_I(inode)->i_data_sem); 5374 ext4_discard_preallocations(inode, 0); 5375 5376 ret = ext4_es_remove_extent(inode, punch_start, 5377 EXT_MAX_BLOCKS - punch_start); 5378 if (ret) { 5379 up_write(&EXT4_I(inode)->i_data_sem); 5380 goto out_stop; 5381 } 5382 5383 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); 5384 if (ret) { 5385 up_write(&EXT4_I(inode)->i_data_sem); 5386 goto out_stop; 5387 } 5388 ext4_discard_preallocations(inode, 0); 5389 5390 ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5391 punch_stop - punch_start, SHIFT_LEFT); 5392 if (ret) { 5393 up_write(&EXT4_I(inode)->i_data_sem); 5394 goto out_stop; 5395 } 5396 5397 new_size = inode->i_size - len; 5398 i_size_write(inode, new_size); 5399 EXT4_I(inode)->i_disksize = new_size; 5400 5401 up_write(&EXT4_I(inode)->i_data_sem); 5402 if (IS_SYNC(inode)) 5403 ext4_handle_sync(handle); 5404 inode->i_mtime = inode->i_ctime = current_time(inode); 5405 ret = ext4_mark_inode_dirty(handle, inode); 5406 ext4_update_inode_fsync_trans(handle, inode, 1); 5407 5408out_stop: 5409 ext4_journal_stop(handle); 5410 ext4_fc_stop_ineligible(sb); 5411out_mmap: 5412 up_write(&EXT4_I(inode)->i_mmap_sem); 5413out_mutex: 5414 inode_unlock(inode); 5415 return ret; 5416} 5417 5418/* 5419 * ext4_insert_range: 5420 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. 5421 * The data blocks starting from @offset to the EOF are shifted by @len 5422 * towards right to create a hole in the @inode. Inode size is increased 5423 * by len bytes. 
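 *
 * Illustrative example: inserting 1 MB at offset 4 MB of a 10 MB file
 * shifts the extents covering [4 MB, 10 MB) right by 1 MB, leaves an
 * unallocated hole at [4 MB, 5 MB), and grows i_size to 11 MB.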
5424 * Returns 0 on success, error otherwise. 5425 */ 5426static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) 5427{ 5428 struct inode *inode = file_inode(file); 5429 struct super_block *sb = inode->i_sb; 5430 handle_t *handle; 5431 struct ext4_ext_path *path; 5432 struct ext4_extent *extent; 5433 ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; 5434 unsigned int credits, ee_len; 5435 int ret = 0, depth, split_flag = 0; 5436 loff_t ioffset; 5437 5438 /* 5439 * We need to test this early because xfstests assumes that an 5440 * insert range of (0, 1) will return EOPNOTSUPP if the file 5441 * system does not support insert range. 5442 */ 5443 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5444 return -EOPNOTSUPP; 5445 5446 /* Insert range works only on fs cluster size aligned regions. */ 5447 if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) 5448 return -EINVAL; 5449 5450 trace_ext4_insert_range(inode, offset, len); 5451 5452 offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5453 len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); 5454 5455 /* Call ext4_force_commit to flush all data in case of data=journal */ 5456 if (ext4_should_journal_data(inode)) { 5457 ret = ext4_force_commit(inode->i_sb); 5458 if (ret) 5459 return ret; 5460 } 5461 5462 inode_lock(inode); 5463 /* Currently just for extent based files */ 5464 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5465 ret = -EOPNOTSUPP; 5466 goto out_mutex; 5467 } 5468 5469 /* Check whether the maximum file size would be exceeded */ 5470 if (len > inode->i_sb->s_maxbytes - inode->i_size) { 5471 ret = -EFBIG; 5472 goto out_mutex; 5473 } 5474 5475 /* Offset must be less than i_size */ 5476 if (offset >= inode->i_size) { 5477 ret = -EINVAL; 5478 goto out_mutex; 5479 } 5480 5481 /* Wait for existing dio to complete */ 5482 inode_dio_wait(inode); 5483 5484 ret = file_modified(file); 5485 if (ret) 5486 goto out_mutex; 5487 5488 /* 5489 * Prevent page faults from reinstantiating pages we have released from 5490 * page cache. 5491 */ 5492 down_write(&EXT4_I(inode)->i_mmap_sem); 5493 5494 ret = ext4_break_layouts(inode); 5495 if (ret) 5496 goto out_mmap; 5497 5498 /* 5499 * Need to round down to align start offset to page size boundary 5500 * for page size > block size. 
5501 */ 5502 ioffset = round_down(offset, PAGE_SIZE); 5503 /* Write out all dirty pages */ 5504 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5505 LLONG_MAX); 5506 if (ret) 5507 goto out_mmap; 5508 truncate_pagecache(inode, ioffset); 5509 5510 credits = ext4_writepage_trans_blocks(inode); 5511 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5512 if (IS_ERR(handle)) { 5513 ret = PTR_ERR(handle); 5514 goto out_mmap; 5515 } 5516 ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); 5517 5518 /* Expand file to avoid data loss if there is error while shifting */ 5519 inode->i_size += len; 5520 EXT4_I(inode)->i_disksize += len; 5521 inode->i_mtime = inode->i_ctime = current_time(inode); 5522 ret = ext4_mark_inode_dirty(handle, inode); 5523 if (ret) 5524 goto out_stop; 5525 5526 down_write(&EXT4_I(inode)->i_data_sem); 5527 ext4_discard_preallocations(inode, 0); 5528 5529 path = ext4_find_extent(inode, offset_lblk, NULL, 0); 5530 if (IS_ERR(path)) { 5531 up_write(&EXT4_I(inode)->i_data_sem); 5532 goto out_stop; 5533 } 5534 5535 depth = ext_depth(inode); 5536 extent = path[depth].p_ext; 5537 if (extent) { 5538 ee_start_lblk = le32_to_cpu(extent->ee_block); 5539 ee_len = ext4_ext_get_actual_len(extent); 5540 5541 /* 5542 * If offset_lblk is not the starting block of extent, split 5543 * the extent @offset_lblk 5544 */ 5545 if ((offset_lblk > ee_start_lblk) && 5546 (offset_lblk < (ee_start_lblk + ee_len))) { 5547 if (ext4_ext_is_unwritten(extent)) 5548 split_flag = EXT4_EXT_MARK_UNWRIT1 | 5549 EXT4_EXT_MARK_UNWRIT2; 5550 ret = ext4_split_extent_at(handle, inode, &path, 5551 offset_lblk, split_flag, 5552 EXT4_EX_NOCACHE | 5553 EXT4_GET_BLOCKS_PRE_IO | 5554 EXT4_GET_BLOCKS_METADATA_NOFAIL); 5555 } 5556 5557 ext4_ext_drop_refs(path); 5558 kfree(path); 5559 if (ret < 0) { 5560 up_write(&EXT4_I(inode)->i_data_sem); 5561 goto out_stop; 5562 } 5563 } else { 5564 ext4_ext_drop_refs(path); 5565 kfree(path); 5566 } 5567 5568 ret = ext4_es_remove_extent(inode, offset_lblk, 5569 EXT_MAX_BLOCKS - offset_lblk); 5570 if (ret) { 5571 up_write(&EXT4_I(inode)->i_data_sem); 5572 goto out_stop; 5573 } 5574 5575 /* 5576 * if offset_lblk lies in a hole which is at start of file, use 5577 * ee_start_lblk to shift extents 5578 */ 5579 ret = ext4_ext_shift_extents(inode, handle, 5580 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, 5581 len_lblk, SHIFT_RIGHT); 5582 5583 up_write(&EXT4_I(inode)->i_data_sem); 5584 if (IS_SYNC(inode)) 5585 ext4_handle_sync(handle); 5586 if (ret >= 0) 5587 ext4_update_inode_fsync_trans(handle, inode, 1); 5588 5589out_stop: 5590 ext4_journal_stop(handle); 5591 ext4_fc_stop_ineligible(sb); 5592out_mmap: 5593 up_write(&EXT4_I(inode)->i_mmap_sem); 5594out_mutex: 5595 inode_unlock(inode); 5596 return ret; 5597} 5598 5599/** 5600 * ext4_swap_extents() - Swap extents between two inodes 5601 * @handle: handle for this transaction 5602 * @inode1: First inode 5603 * @inode2: Second inode 5604 * @lblk1: Start block for first inode 5605 * @lblk2: Start block for second inode 5606 * @count: Number of blocks to swap 5607 * @unwritten: Mark second inode's extents as unwritten after swap 5608 * @erp: Pointer to save error value 5609 * 5610 * This helper routine does exactly what is promise "swap extents". All other 5611 * stuff such as page-cache locking consistency, bh mapping consistency or 5612 * extent's data copying must be performed by caller. 
/**
 * ext4_swap_extents() - Swap extents between two inodes
 * @handle: handle for this transaction
 * @inode1:	First inode
 * @inode2:	Second inode
 * @lblk1:	Start block for first inode
 * @lblk2:	Start block for second inode
 * @count:	Number of blocks to swap
 * @unwritten: Mark second inode's extents as unwritten after swap
 * @erp:	Pointer to save error value
 *
 * This helper routine does exactly what it promises: "swap extents". All
 * other work such as page-cache locking consistency, bh mapping consistency
 * or extent's data copying must be performed by the caller.
 * Locking:
 *		i_mutex is held for both inodes
 *		i_data_sem is locked for write for both inodes
 * Assumptions:
 *		All pages from requested range are locked for both inodes
 */
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
		  ext4_lblk_t count, int unwritten, int *erp)
{
	struct ext4_ext_path *path1 = NULL;
	struct ext4_ext_path *path2 = NULL;
	int replaced_count = 0;

	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
	BUG_ON(!inode_is_locked(inode1));
	BUG_ON(!inode_is_locked(inode2));

	*erp = ext4_es_remove_extent(inode1, lblk1, count);
	if (unlikely(*erp))
		return 0;
	*erp = ext4_es_remove_extent(inode2, lblk2, count);
	if (unlikely(*erp))
		return 0;

	while (count) {
		struct ext4_extent *ex1, *ex2, tmp_ex;
		ext4_lblk_t e1_blk, e2_blk;
		int e1_len, e2_len, len;
		int split = 0;

		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
		if (IS_ERR(path1)) {
			*erp = PTR_ERR(path1);
			path1 = NULL;
		finish:
			count = 0;
			goto repeat;
		}
		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
		if (IS_ERR(path2)) {
			*erp = PTR_ERR(path2);
			path2 = NULL;
			goto finish;
		}
		ex1 = path1[path1->p_depth].p_ext;
		ex2 = path2[path2->p_depth].p_ext;
		/* Do we have something to swap? */
		if (unlikely(!ex2 || !ex1))
			goto finish;

		e1_blk = le32_to_cpu(ex1->ee_block);
		e2_blk = le32_to_cpu(ex2->ee_block);
		e1_len = ext4_ext_get_actual_len(ex1);
		e2_len = ext4_ext_get_actual_len(ex2);

		/* Hole handling */
		if (!in_range(lblk1, e1_blk, e1_len) ||
		    !in_range(lblk2, e2_blk, e2_len)) {
			ext4_lblk_t next1, next2;

			/* if hole after extent, then go to next extent */
			next1 = ext4_ext_next_allocated_block(path1);
			next2 = ext4_ext_next_allocated_block(path2);
			/* If hole before extent, then shift to that extent */
			if (e1_blk > lblk1)
				next1 = e1_blk;
			if (e2_blk > lblk2)
				next2 = e2_blk;
			/* Do we have something to swap? */
			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
				goto finish;
			/* Move to the rightmost boundary */
			len = next1 - lblk1;
			if (len < next2 - lblk2)
				len = next2 - lblk2;
			if (len > count)
				len = count;
			lblk1 += len;
			lblk2 += len;
			count -= len;
			goto repeat;
		}

		/* Prepare left boundary */
		if (e1_blk < lblk1) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode1,
						&path1, lblk1, 0);
			if (unlikely(*erp))
				goto finish;
		}
		if (e2_blk < lblk2) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode2,
						&path2, lblk2, 0);
			if (unlikely(*erp))
				goto finish;
		}
		/* ext4_split_extent_at() may result in leaf extent split,
		 * the path must be revalidated. */
		if (split)
			goto repeat;

		/* Prepare right boundary */
		len = count;
		if (len > e1_blk + e1_len - lblk1)
			len = e1_blk + e1_len - lblk1;
		if (len > e2_blk + e2_len - lblk2)
			len = e2_blk + e2_len - lblk2;

		if (len != e1_len) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode1,
						&path1, lblk1 + len, 0);
			if (unlikely(*erp))
				goto finish;
		}
		if (len != e2_len) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode2,
						&path2, lblk2 + len, 0);
			if (*erp)
				goto finish;
		}
		/* ext4_split_extent_at() may result in leaf extent split,
		 * the path must be revalidated. */
		if (split)
			goto repeat;

		BUG_ON(e2_len != e1_len);
		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
		if (unlikely(*erp))
			goto finish;
		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
		if (unlikely(*erp))
			goto finish;

		/* Both extents are fully inside boundaries. Swap it now */
		tmp_ex = *ex1;
		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
		ex1->ee_len = cpu_to_le16(e2_len);
		ex2->ee_len = cpu_to_le16(e1_len);
		if (unwritten)
			ext4_ext_mark_unwritten(ex2);
		if (ext4_ext_is_unwritten(&tmp_ex))
			ext4_ext_mark_unwritten(ex1);

		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
		*erp = ext4_ext_dirty(handle, inode2, path2 +
				      path2->p_depth);
		if (unlikely(*erp))
			goto finish;
		*erp = ext4_ext_dirty(handle, inode1, path1 +
				      path1->p_depth);
		/*
		 * Looks scary, huh? The second inode already points to the
		 * new blocks and has been successfully dirtied. Fortunately,
		 * an error here can only be a journal error, in which case
		 * the whole transaction will be aborted anyway.
		 */
		if (unlikely(*erp))
			goto finish;
		lblk1 += len;
		lblk2 += len;
		replaced_count += len;
		count -= len;

	repeat:
		ext4_ext_drop_refs(path1);
		kfree(path1);
		ext4_ext_drop_refs(path2);
		kfree(path2);
		path1 = path2 = NULL;
	}
	return replaced_count;
}

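/*
 * Illustrative sketch (not part of this file): a caller of
 * ext4_swap_extents() must provide all of the locking described above.
 * Ignoring lock ordering, page-cache handling and error unwinding, the
 * minimal shape is roughly the following; the in-tree caller,
 * ext4_move_extents(), uses its own double-locking helpers instead.
 *
 *	int swapped, err = 0;
 *
 *	lock_two_nondirectories(inode1, inode2);
 *	down_write(&EXT4_I(inode1)->i_data_sem);
 *	down_write_nested(&EXT4_I(inode2)->i_data_sem, I_DATA_SEM_OTHER);
 *	swapped = ext4_swap_extents(handle, inode1, inode2,
 *				    lblk1, lblk2, count, 1, &err);
 *	up_write(&EXT4_I(inode2)->i_data_sem);
 *	up_write(&EXT4_I(inode1)->i_data_sem);
 *	unlock_two_nondirectories(inode1, inode2);
 *
 * The return value is the number of blocks actually swapped, while *erp
 * carries any error encountered along the way.
 */
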
/*
 * ext4_clu_mapped - determine whether any block in a logical cluster has
 *                   been mapped to a physical cluster
 *
 * @inode - file containing the logical cluster
 * @lclu - logical cluster of interest
 *
 * Returns 1 if any block in the logical cluster is mapped, signifying
 * that a physical cluster has been allocated for it. Otherwise,
 * returns 0. Can also return negative error codes. Derived from
 * ext4_ext_map_blocks().
 */
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_ext_path *path;
	int depth, mapped = 0, err = 0;
	struct ext4_extent *extent;
	ext4_lblk_t first_lblk, first_lclu, last_lclu;

	/*
	 * if data can be stored inline, the logical cluster isn't
	 * mapped - no physical clusters have been allocated, and the
	 * file has no extents
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
	    ext4_has_inline_data(inode))
		return 0;

	/* search for the extent closest to the first block in the cluster */
	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		path = NULL;
		goto out;
	}

	depth = ext_depth(inode);

	/*
	 * A consistent leaf must not be empty. This situation is possible,
	 * though, _during_ tree modification, and it's why an assert can't
	 * be put in ext4_find_extent().
	 */
	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
		EXT4_ERROR_INODE(inode,
		    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
				 (unsigned long) EXT4_C2B(sbi, lclu),
				 depth, path[depth].p_block);
		err = -EFSCORRUPTED;
		goto out;
	}

	extent = path[depth].p_ext;

	/* can't be mapped if the extent tree is empty */
	if (extent == NULL)
		goto out;

	first_lblk = le32_to_cpu(extent->ee_block);
	first_lclu = EXT4_B2C(sbi, first_lblk);

	/*
	 * Three possible outcomes at this point - found extent spanning
	 * the target cluster, to the left of the target cluster, or to the
	 * right of the target cluster. The first two cases are handled here.
	 * The last case indicates the target cluster is not mapped.
	 */
	if (lclu >= first_lclu) {
		last_lclu = EXT4_B2C(sbi, first_lblk +
				     ext4_ext_get_actual_len(extent) - 1);
		if (lclu <= last_lclu) {
			mapped = 1;
		} else {
			first_lblk = ext4_ext_next_allocated_block(path);
			first_lclu = EXT4_B2C(sbi, first_lblk);
			if (lclu == first_lclu)
				mapped = 1;
		}
	}

out:
	ext4_ext_drop_refs(path);
	kfree(path);

	return err ? err : mapped;
}

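/*
 * Worked example for the cluster arithmetic above (the cluster ratio is an
 * assumption for illustration): with a bigalloc cluster of 16 blocks,
 * EXT4_C2B(sbi, lclu) == lclu << 4 and EXT4_B2C(sbi, lblk) == lblk >> 4.
 * For lclu == 3 the lookup starts at logical block 48; an extent covering
 * logical blocks 40..50 gives first_lclu == 2 and last_lclu == 3, so the
 * cluster is reported as mapped.
 */
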
/*
 * Updates the physical block address and unwritten status of the extent
 * starting at logical block @start and of length @len. If such an extent
 * doesn't exist, this function splits the extent tree appropriately to
 * create one. This function is called in the fast commit replay path.
 * Returns 0 on success and an error on failure.
 */
int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
			      int len, int unwritten, ext4_fsblk_t pblk)
{
	struct ext4_ext_path *path = NULL, *ppath;
	struct ext4_extent *ex;
	int ret;

	path = ext4_find_extent(inode, start, NULL, 0);
	if (IS_ERR(path))
		return PTR_ERR(path);
	ex = path[path->p_depth].p_ext;
	if (!ex) {
		ret = -EFSCORRUPTED;
		goto out;
	}

	if (le32_to_cpu(ex->ee_block) != start ||
		ext4_ext_get_actual_len(ex) != len) {
		/* We need to split this extent to match our extent first */
		ppath = path;
		down_write(&EXT4_I(inode)->i_data_sem);
		ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
		up_write(&EXT4_I(inode)->i_data_sem);
		if (ret)
			goto out;
		kfree(path);
		path = ext4_find_extent(inode, start, NULL, 0);
		if (IS_ERR(path))
			return -1;
		ppath = path;
		ex = path[path->p_depth].p_ext;
		WARN_ON(le32_to_cpu(ex->ee_block) != start);
		if (ext4_ext_get_actual_len(ex) != len) {
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_force_split_extent_at(NULL, inode, &ppath,
							 start + len, 1);
			up_write(&EXT4_I(inode)->i_data_sem);
			if (ret)
				goto out;
			kfree(path);
			path = ext4_find_extent(inode, start, NULL, 0);
			if (IS_ERR(path))
				return -EINVAL;
			ex = path[path->p_depth].p_ext;
		}
	}
	if (unwritten)
		ext4_ext_mark_unwritten(ex);
	else
		ext4_ext_mark_initialized(ex);
	ext4_ext_store_pblock(ex, pblk);
	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
	up_write(&EXT4_I(inode)->i_data_sem);
out:
	ext4_ext_drop_refs(path);
	kfree(path);
	ext4_mark_inode_dirty(NULL, inode);
	return ret;
}

/* Try to shrink the extent tree */
void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t old_cur, cur = 0;

	while (cur < end) {
		path = ext4_find_extent(inode, cur, NULL, 0);
		if (IS_ERR(path))
			return;
		ex = path[path->p_depth].p_ext;
		if (!ex) {
			ext4_ext_drop_refs(path);
			kfree(path);
			ext4_mark_inode_dirty(NULL, inode);
			return;
		}
		old_cur = cur;
		cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
		if (cur <= old_cur)
			cur = old_cur + 1;
		ext4_ext_try_to_merge(NULL, inode, path, ex);
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
		up_write(&EXT4_I(inode)->i_data_sem);
		ext4_mark_inode_dirty(NULL, inode);
		ext4_ext_drop_refs(path);
		kfree(path);
	}
}

/* Check if *cur is a hole and if it is, skip it */
static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
	int ret;
	struct ext4_map_blocks map;

	map.m_lblk = *cur;
	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;

	ret = ext4_map_blocks(NULL, inode, &map, 0);
	if (ret < 0)
		return ret;
	if (ret != 0)
		return 0;
	*cur = *cur + map.m_len;
	return 0;
}

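/*
 * Illustrative sketch of the ext4_map_blocks() lookup used by the replay
 * helpers above (the values are assumptions for the example):
 *
 *	struct ext4_map_blocks map = {
 *		.m_lblk = 0,	// first logical block to probe
 *		.m_len = 128,	// number of logical blocks to probe
 *	};
 *	int ret = ext4_map_blocks(NULL, inode, &map, 0);
 *
 * A negative ret is an error; ret == 0 means the range starts with a hole
 * and map.m_len is set to its length, which is why skip_hole() advances
 * *cur by map.m_len; ret > 0 is the number of contiguous blocks mapped
 * starting at physical block map.m_pblk.
 */
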
/* Count number of blocks used by this inode and update i_blocks */
int ext4_ext_replay_set_iblocks(struct inode *inode)
{
	struct ext4_ext_path *path = NULL, *path2 = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t cur = 0, end;
	int numblks = 0, i, ret = 0;
	ext4_fsblk_t cmp1, cmp2;
	struct ext4_map_blocks map;

	/* Determine the size of the file first */
	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
					EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return PTR_ERR(path);
	ex = path[path->p_depth].p_ext;
	if (!ex) {
		ext4_ext_drop_refs(path);
		kfree(path);
		goto out;
	}
	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
	ext4_ext_drop_refs(path);
	kfree(path);

	/* Count the number of data blocks */
	cur = 0;
	while (cur < end) {
		map.m_lblk = cur;
		map.m_len = end - cur;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			break;
		if (ret > 0)
			numblks += ret;
		cur = cur + map.m_len;
	}

	/*
	 * Count the number of extent tree blocks. We do it by looking up
	 * two successive extents and determining the difference between
	 * their paths. When the paths of two successive extents differ,
	 * we compare the blocks in the path at each level and increment
	 * iblocks by the total number of differences found.
	 */
	cur = 0;
	ret = skip_hole(inode, &cur);
	if (ret < 0)
		goto out;
	path = ext4_find_extent(inode, cur, NULL, 0);
	if (IS_ERR(path))
		goto out;
	numblks += path->p_depth;
	ext4_ext_drop_refs(path);
	kfree(path);
	while (cur < end) {
		path = ext4_find_extent(inode, cur, NULL, 0);
		if (IS_ERR(path))
			break;
		ex = path[path->p_depth].p_ext;
		if (!ex) {
			ext4_ext_drop_refs(path);
			kfree(path);
			return 0;
		}
		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
				ext4_ext_get_actual_len(ex));
		ret = skip_hole(inode, &cur);
		if (ret < 0) {
			ext4_ext_drop_refs(path);
			kfree(path);
			break;
		}
		path2 = ext4_find_extent(inode, cur, NULL, 0);
		if (IS_ERR(path2)) {
			ext4_ext_drop_refs(path);
			kfree(path);
			break;
		}
		ex = path2[path2->p_depth].p_ext;
		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
			cmp1 = cmp2 = 0;
			if (i <= path->p_depth)
				cmp1 = path[i].p_bh ?
					path[i].p_bh->b_blocknr : 0;
			if (i <= path2->p_depth)
				cmp2 = path2[i].p_bh ?
					path2[i].p_bh->b_blocknr : 0;
			if (cmp1 != cmp2 && cmp2 != 0)
				numblks++;
		}
		ext4_ext_drop_refs(path);
		ext4_ext_drop_refs(path2);
		kfree(path);
		kfree(path2);
	}

out:
	/* i_blocks is counted in 512-byte sectors, not in fs blocks */
	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
	ext4_mark_inode_dirty(NULL, inode);
	return 0;
}

/*
 * Clear the in-use bits in the block bitmaps for every block referenced by
 * this inode (data blocks as well as extent tree blocks) and record the
 * affected regions. Used during fast commit replay.
 */
int ext4_ext_clear_bb(struct inode *inode)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t cur = 0, end;
	int j, ret = 0;
	struct ext4_map_blocks map;

	/* Determine the size of the file first */
	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
					EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return PTR_ERR(path);
	ex = path[path->p_depth].p_ext;
	if (!ex) {
		ext4_ext_drop_refs(path);
		kfree(path);
		return 0;
	}
	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
	ext4_ext_drop_refs(path);
	kfree(path);

	cur = 0;
	while (cur < end) {
		map.m_lblk = cur;
		map.m_len = end - cur;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
			if (!IS_ERR_OR_NULL(path)) {
				for (j = 0; j < path->p_depth; j++) {
					ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 0);
					ext4_fc_record_regions(inode->i_sb, inode->i_ino,
							0, path[j].p_block, 1, 1);
				}
				ext4_ext_drop_refs(path);
				kfree(path);
			}
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			ext4_fc_record_regions(inode->i_sb, inode->i_ino,
					map.m_lblk, map.m_pblk, map.m_len, 1);
		}
		cur = cur + map.m_len;
	}

	return 0;
}