// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree and once
 * to do all the other items.
 */
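/*
 * The three reads described above map onto the LOG_WALK_* stages:
 * LOG_WALK_PIN_ONLY pins down the extents used by the log,
 * LOG_WALK_REPLAY_INODES creates the logged inodes in the subvolume, and
 * LOG_WALK_REPLAY_DIR_INDEX plus LOG_WALK_REPLAY_ALL replay the directory
 * entries and all the remaining items.
 */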
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
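/*
 * Write out or wait on the page range backing a single tree block in the
 * btree inode's address space. These are used while flushing log tree
 * blocks to disk during a sync (the wc->write / wc->wait walks below).
 */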
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer? This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish? Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
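/*
 * Example configuration (a sketch based on the field comments above, not
 * lifted from a real caller): the pin-only replay pass runs with roughly
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.trans = trans,
 *		.replay_dest = root,
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *	};
 *
 * while a sync flushes log blocks with .write = 1 and waits on them with
 * .wait = 1, and freeing a log tree at transaction commit sets .free = 1.
 */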
/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero. This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
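/*
 * Note: read_one_inode() below folds btrfs_iget() errors into a NULL
 * return, and its callers translate NULL into -EIO or -ENOENT depending
 * on context.
 */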
/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'. path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet. So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file. This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree? If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	 extent data disk byte 12845056 nr 102400
			 *	 extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	 extent data disk byte 12845056 nr 102400
			 *	 extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							fs_info->csum_root,
							sums->bytenr,
							sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}
/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}
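/*
 * Resolve conflicts between the back reference being replayed and what is
 * already in the subvolume: any victim name found in the subvolume's refs
 * but not in the log is unlinked, and dir items that clash on the sequence
 * number or on the name are dropped. Returns 1 if the back reference
 * belongs to the root directory (nothing to do), 0 on success and < 0 on
 * error.
 */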
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log. If so, we allow them to stay,
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref; there is no need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}
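/*
 * Decode one inode extref at ref_ptr: copy out the name (the caller must
 * kfree() it) and optionally return the dir index and parent objectid.
 */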
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists). For any name that is not in the inode reference item from the log
 * tree, do a proper unlink of that name (that is, remove its entry from the
 * inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			/*
			 * Whenever we need to check if a name exists or not, we
			 * check the subvolume tree. So after an unlink we must
			 * run delayed items, so that future checks for a name
			 * during log replay see that the name does not exist
			 * anymore.
			 */
			if (!ret)
				ret = btrfs_run_delayed_items(trans);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(path);
	return ret;
}
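/*
 * Check whether the subvolume tree contains an inode ref or extref
 * (depending on ref_type) for the given name against the parent dir.
 * Returns 1 if it exists, 0 if it does not and < 0 on error.
 */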
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}
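/*
 * Add a link for the inode into the directory during replay, after first
 * unlinking any conflicting dentry that points to a different inode from
 * the log that has not been processed yet.
 */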
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}
/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function. (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode. If we don't find the dir, just don't
	 * copy the back ref in. The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link. Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
				/*
				 * Whenever we need to check if a name exists or
				 * not, we check the subvolume tree. So after an
				 * unlink we must run delayed items, so that future
				 * checks for a name during log replay see that the
				 * name does not exist anymore.
				 */
				if (!ret)
					ret = btrfs_run_delayed_items(trans);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}
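/* Insert an orphan item for the inode unless one already exists. */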
static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}
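/*
 * Count one link per name stored in all the INODE_EXTREF items of the
 * inode. Returns the link count, or a negative errno on failure (the
 * -ENOENT that ends the search loop just terminates the count).
 */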
static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}
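/*
 * Count one link per name stored in all the INODE_REF items of the inode,
 * walking the items from the highest key offset backwards.
 */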
static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}
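/*
 * Walk all the orphan items recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID
 * and run the link count fixup on each inode they point to, deleting the
 * fixup items as we go.
 */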
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done. The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist. This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}
 */
		ret = 0;
		update_size = false;
		goto out;
	}

	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_EXTREF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
			     name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	struct btrfs_path *fixup_path = NULL;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret < 0)
			break;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;

		/*
		 * If this entry refers to a non-directory (directories cannot
		 * have a link count > 1) and it was added in the transaction
		 * that was not committed, make sure we fix up the link count
		 * of the inode the entry points to. Otherwise something like
		 * the following would result in a directory pointing to an
		 * inode with a wrong link count that does not account for
		 * this dir entry:
		 *
		 * mkdir testdir
		 * touch testdir/foo
		 * touch testdir/bar
		 * sync
		 *
		 * ln testdir/bar testdir/bar_link
		 * ln testdir/foo testdir/foo_link
		 * xfs_io -c "fsync" testdir/bar
		 *
		 * <power failure>
		 *
		 * mount fs, log replay happens
		 *
		 * File foo would remain with a link count of 1 when it has two
		 * entries pointing to it in the directory testdir. This would
		 * make it impossible to ever delete the parent directory, as
		 * it would result in stale dentries that can never be deleted.
		 */
		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
			struct btrfs_key di_key;

			if (!fixup_path) {
				fixup_path = btrfs_alloc_path();
				if (!fixup_path) {
					ret = -ENOMEM;
					break;
				}
			}

			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
			ret = link_to_fixup_dir(trans, root, fixup_path,
						di_key.objectid);
			if (ret)
				break;
		}
		ret = 0;
	}
	btrfs_free_path(fixup_path);
	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
2157 * 2158 * The range items tell us which parts of the key space the log 2159 * is authoritative for. During replay, if a key in the subvolume 2160 * directory is in a logged range item, but not actually in the log 2161 * that means it was deleted from the directory before the fsync 2162 * and should be removed. 2163 */ 2164static noinline int find_dir_range(struct btrfs_root *root, 2165 struct btrfs_path *path, 2166 u64 dirid, int key_type, 2167 u64 *start_ret, u64 *end_ret) 2168{ 2169 struct btrfs_key key; 2170 u64 found_end; 2171 struct btrfs_dir_log_item *item; 2172 int ret; 2173 int nritems; 2174 2175 if (*start_ret == (u64)-1) 2176 return 1; 2177 2178 key.objectid = dirid; 2179 key.type = key_type; 2180 key.offset = *start_ret; 2181 2182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2183 if (ret < 0) 2184 goto out; 2185 if (ret > 0) { 2186 if (path->slots[0] == 0) 2187 goto out; 2188 path->slots[0]--; 2189 } 2190 if (ret != 0) 2191 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2192 2193 if (key.type != key_type || key.objectid != dirid) { 2194 ret = 1; 2195 goto next; 2196 } 2197 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2198 struct btrfs_dir_log_item); 2199 found_end = btrfs_dir_log_end(path->nodes[0], item); 2200 2201 if (*start_ret >= key.offset && *start_ret <= found_end) { 2202 ret = 0; 2203 *start_ret = key.offset; 2204 *end_ret = found_end; 2205 goto out; 2206 } 2207 ret = 1; 2208next: 2209 /* check the next slot in the tree to see if it is a valid item */ 2210 nritems = btrfs_header_nritems(path->nodes[0]); 2211 path->slots[0]++; 2212 if (path->slots[0] >= nritems) { 2213 ret = btrfs_next_leaf(root, path); 2214 if (ret) 2215 goto out; 2216 } 2217 2218 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2219 2220 if (key.type != key_type || key.objectid != dirid) { 2221 ret = 1; 2222 goto out; 2223 } 2224 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2225 struct btrfs_dir_log_item); 2226 found_end = btrfs_dir_log_end(path->nodes[0], item); 2227 *start_ret = key.offset; 2228 *end_ret = found_end; 2229 ret = 0; 2230out: 2231 btrfs_release_path(path); 2232 return ret; 2233} 2234 2235/* 2236 * this looks for a given directory item in the log. 
If the directory 2237 * item is not in the log, the item is removed and the inode it points 2238 * to is unlinked 2239 */ 2240static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2241 struct btrfs_root *root, 2242 struct btrfs_root *log, 2243 struct btrfs_path *path, 2244 struct btrfs_path *log_path, 2245 struct inode *dir, 2246 struct btrfs_key *dir_key) 2247{ 2248 int ret; 2249 struct extent_buffer *eb; 2250 int slot; 2251 u32 item_size; 2252 struct btrfs_dir_item *di; 2253 struct btrfs_dir_item *log_di; 2254 int name_len; 2255 unsigned long ptr; 2256 unsigned long ptr_end; 2257 char *name; 2258 struct inode *inode; 2259 struct btrfs_key location; 2260 2261again: 2262 eb = path->nodes[0]; 2263 slot = path->slots[0]; 2264 item_size = btrfs_item_size_nr(eb, slot); 2265 ptr = btrfs_item_ptr_offset(eb, slot); 2266 ptr_end = ptr + item_size; 2267 while (ptr < ptr_end) { 2268 di = (struct btrfs_dir_item *)ptr; 2269 name_len = btrfs_dir_name_len(eb, di); 2270 name = kmalloc(name_len, GFP_NOFS); 2271 if (!name) { 2272 ret = -ENOMEM; 2273 goto out; 2274 } 2275 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2276 name_len); 2277 log_di = NULL; 2278 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2279 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2280 dir_key->objectid, 2281 name, name_len, 0); 2282 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2283 log_di = btrfs_lookup_dir_index_item(trans, log, 2284 log_path, 2285 dir_key->objectid, 2286 dir_key->offset, 2287 name, name_len, 0); 2288 } 2289 if (!log_di || log_di == ERR_PTR(-ENOENT)) { 2290 btrfs_dir_item_key_to_cpu(eb, di, &location); 2291 btrfs_release_path(path); 2292 btrfs_release_path(log_path); 2293 inode = read_one_inode(root, location.objectid); 2294 if (!inode) { 2295 kfree(name); 2296 return -EIO; 2297 } 2298 2299 ret = link_to_fixup_dir(trans, root, 2300 path, location.objectid); 2301 if (ret) { 2302 kfree(name); 2303 iput(inode); 2304 goto out; 2305 } 2306 2307 inc_nlink(inode); 2308 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2309 BTRFS_I(inode), name, name_len); 2310 if (!ret) 2311 ret = btrfs_run_delayed_items(trans); 2312 kfree(name); 2313 iput(inode); 2314 if (ret) 2315 goto out; 2316 2317 /* there might still be more names under this key 2318 * check and repeat if required 2319 */ 2320 ret = btrfs_search_slot(NULL, root, dir_key, path, 2321 0, 0); 2322 if (ret == 0) 2323 goto again; 2324 ret = 0; 2325 goto out; 2326 } else if (IS_ERR(log_di)) { 2327 kfree(name); 2328 return PTR_ERR(log_di); 2329 } 2330 btrfs_release_path(log_path); 2331 kfree(name); 2332 2333 ptr = (unsigned long)(di + 1); 2334 ptr += name_len; 2335 } 2336 ret = 0; 2337out: 2338 btrfs_release_path(path); 2339 btrfs_release_path(log_path); 2340 return ret; 2341} 2342 2343static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2344 struct btrfs_root *root, 2345 struct btrfs_root *log, 2346 struct btrfs_path *path, 2347 const u64 ino) 2348{ 2349 struct btrfs_key search_key; 2350 struct btrfs_path *log_path; 2351 int i; 2352 int nritems; 2353 int ret; 2354 2355 log_path = btrfs_alloc_path(); 2356 if (!log_path) 2357 return -ENOMEM; 2358 2359 search_key.objectid = ino; 2360 search_key.type = BTRFS_XATTR_ITEM_KEY; 2361 search_key.offset = 0; 2362again: 2363 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2364 if (ret < 0) 2365 goto out; 2366process_leaf: 2367 nritems = btrfs_header_nritems(path->nodes[0]); 2368 for (i = path->slots[0]; i < nritems; i++) { 2369 struct btrfs_key key; 2370 
struct btrfs_dir_item *di; 2371 struct btrfs_dir_item *log_di; 2372 u32 total_size; 2373 u32 cur; 2374 2375 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2376 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2377 ret = 0; 2378 goto out; 2379 } 2380 2381 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2382 total_size = btrfs_item_size_nr(path->nodes[0], i); 2383 cur = 0; 2384 while (cur < total_size) { 2385 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2386 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2387 u32 this_len = sizeof(*di) + name_len + data_len; 2388 char *name; 2389 2390 name = kmalloc(name_len, GFP_NOFS); 2391 if (!name) { 2392 ret = -ENOMEM; 2393 goto out; 2394 } 2395 read_extent_buffer(path->nodes[0], name, 2396 (unsigned long)(di + 1), name_len); 2397 2398 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2399 name, name_len, 0); 2400 btrfs_release_path(log_path); 2401 if (!log_di) { 2402 /* Doesn't exist in log tree, so delete it. */ 2403 btrfs_release_path(path); 2404 di = btrfs_lookup_xattr(trans, root, path, ino, 2405 name, name_len, -1); 2406 kfree(name); 2407 if (IS_ERR(di)) { 2408 ret = PTR_ERR(di); 2409 goto out; 2410 } 2411 ASSERT(di); 2412 ret = btrfs_delete_one_dir_name(trans, root, 2413 path, di); 2414 if (ret) 2415 goto out; 2416 btrfs_release_path(path); 2417 search_key = key; 2418 goto again; 2419 } 2420 kfree(name); 2421 if (IS_ERR(log_di)) { 2422 ret = PTR_ERR(log_di); 2423 goto out; 2424 } 2425 cur += this_len; 2426 di = (struct btrfs_dir_item *)((char *)di + this_len); 2427 } 2428 } 2429 ret = btrfs_next_leaf(root, path); 2430 if (ret > 0) 2431 ret = 0; 2432 else if (ret == 0) 2433 goto process_leaf; 2434out: 2435 btrfs_free_path(log_path); 2436 btrfs_release_path(path); 2437 return ret; 2438} 2439 2440 2441/* 2442 * deletion replay happens before we copy any new directory items 2443 * out of the log or out of backreferences from inodes. It 2444 * scans the log to find ranges of keys that log is authoritative for, 2445 * and then scans the directory to find items in those ranges that are 2446 * not present in the log. 2447 * 2448 * Anything we don't find in the log is unlinked and removed from the 2449 * directory. 
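 *
 * As a sketch of the idea: if the log holds a dir range item covering
 * offsets [0, 100] of this directory, then a subvolume entry at offset
 * 50 with no match in the log was deleted before the fsync and is
 * removed here, while an entry at offset 200 falls outside every logged
 * range, so the log makes no claim about it and it is left alone.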
2450 */ 2451static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2452 struct btrfs_root *root, 2453 struct btrfs_root *log, 2454 struct btrfs_path *path, 2455 u64 dirid, int del_all) 2456{ 2457 u64 range_start; 2458 u64 range_end; 2459 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2460 int ret = 0; 2461 struct btrfs_key dir_key; 2462 struct btrfs_key found_key; 2463 struct btrfs_path *log_path; 2464 struct inode *dir; 2465 2466 dir_key.objectid = dirid; 2467 dir_key.type = BTRFS_DIR_ITEM_KEY; 2468 log_path = btrfs_alloc_path(); 2469 if (!log_path) 2470 return -ENOMEM; 2471 2472 dir = read_one_inode(root, dirid); 2473 /* it isn't an error if the inode isn't there, that can happen 2474 * because we replay the deletes before we copy in the inode item 2475 * from the log 2476 */ 2477 if (!dir) { 2478 btrfs_free_path(log_path); 2479 return 0; 2480 } 2481again: 2482 range_start = 0; 2483 range_end = 0; 2484 while (1) { 2485 if (del_all) 2486 range_end = (u64)-1; 2487 else { 2488 ret = find_dir_range(log, path, dirid, key_type, 2489 &range_start, &range_end); 2490 if (ret < 0) 2491 goto out; 2492 else if (ret > 0) 2493 break; 2494 } 2495 2496 dir_key.offset = range_start; 2497 while (1) { 2498 int nritems; 2499 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2500 0, 0); 2501 if (ret < 0) 2502 goto out; 2503 2504 nritems = btrfs_header_nritems(path->nodes[0]); 2505 if (path->slots[0] >= nritems) { 2506 ret = btrfs_next_leaf(root, path); 2507 if (ret == 1) 2508 break; 2509 else if (ret < 0) 2510 goto out; 2511 } 2512 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2513 path->slots[0]); 2514 if (found_key.objectid != dirid || 2515 found_key.type != dir_key.type) 2516 goto next_type; 2517 2518 if (found_key.offset > range_end) 2519 break; 2520 2521 ret = check_item_in_log(trans, root, log, path, 2522 log_path, dir, 2523 &found_key); 2524 if (ret) 2525 goto out; 2526 if (found_key.offset == (u64)-1) 2527 break; 2528 dir_key.offset = found_key.offset + 1; 2529 } 2530 btrfs_release_path(path); 2531 if (range_end == (u64)-1) 2532 break; 2533 range_start = range_end + 1; 2534 } 2535 2536next_type: 2537 ret = 0; 2538 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2539 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2540 dir_key.type = BTRFS_DIR_INDEX_KEY; 2541 btrfs_release_path(path); 2542 goto again; 2543 } 2544out: 2545 btrfs_release_path(path); 2546 btrfs_free_path(log_path); 2547 iput(dir); 2548 return ret; 2549} 2550 2551/* 2552 * the process_func used to replay items from the log tree. This 2553 * gets called in two different stages. The first stage just looks 2554 * for inodes and makes sure they are all copied into the subvolume. 2555 * 2556 * The second stage copies all the other item types from the log into 2557 * the subvolume. The two stage approach is slower, but gets rid of 2558 * lots of complexity around inodes referencing other inodes that exist 2559 * only in the log (references come from either directory items or inode 2560 * back refs). 
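 *
 * For example, a directory entry in the log may name an inode whose
 * inode item sits in a later leaf of the same log tree.  Because the
 * first stage has already created every logged inode, the later stages
 * can insert names and refs in any order without tripping over inodes
 * that do not exist yet.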
2561 */ 2562static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2563 struct walk_control *wc, u64 gen, int level) 2564{ 2565 int nritems; 2566 struct btrfs_path *path; 2567 struct btrfs_root *root = wc->replay_dest; 2568 struct btrfs_key key; 2569 int i; 2570 int ret; 2571 2572 ret = btrfs_read_buffer(eb, gen, level, NULL); 2573 if (ret) 2574 return ret; 2575 2576 level = btrfs_header_level(eb); 2577 2578 if (level != 0) 2579 return 0; 2580 2581 path = btrfs_alloc_path(); 2582 if (!path) 2583 return -ENOMEM; 2584 2585 nritems = btrfs_header_nritems(eb); 2586 for (i = 0; i < nritems; i++) { 2587 btrfs_item_key_to_cpu(eb, &key, i); 2588 2589 /* inode keys are done during the first stage */ 2590 if (key.type == BTRFS_INODE_ITEM_KEY && 2591 wc->stage == LOG_WALK_REPLAY_INODES) { 2592 struct btrfs_inode_item *inode_item; 2593 u32 mode; 2594 2595 inode_item = btrfs_item_ptr(eb, i, 2596 struct btrfs_inode_item); 2597 /* 2598 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2599 * and never got linked before the fsync, skip it, as 2600 * replaying it is pointless since it would be deleted 2601 * later. We skip logging tmpfiles, but it's always 2602 * possible we are replaying a log created with a kernel 2603 * that used to log tmpfiles. 2604 */ 2605 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2606 wc->ignore_cur_inode = true; 2607 continue; 2608 } else { 2609 wc->ignore_cur_inode = false; 2610 } 2611 ret = replay_xattr_deletes(wc->trans, root, log, 2612 path, key.objectid); 2613 if (ret) 2614 break; 2615 mode = btrfs_inode_mode(eb, inode_item); 2616 if (S_ISDIR(mode)) { 2617 ret = replay_dir_deletes(wc->trans, 2618 root, log, path, key.objectid, 0); 2619 if (ret) 2620 break; 2621 } 2622 ret = overwrite_item(wc->trans, root, path, 2623 eb, i, &key); 2624 if (ret) 2625 break; 2626 2627 /* 2628 * Before replaying extents, truncate the inode to its 2629 * size. We need to do it now and not after log replay 2630 * because before an fsync we can have prealloc extents 2631 * added beyond the inode's i_size. If we did it after, 2632 * through orphan cleanup for example, we would drop 2633 * those prealloc extents just after replaying them. 2634 */ 2635 if (S_ISREG(mode)) { 2636 struct inode *inode; 2637 u64 from; 2638 2639 inode = read_one_inode(root, key.objectid); 2640 if (!inode) { 2641 ret = -EIO; 2642 break; 2643 } 2644 from = ALIGN(i_size_read(inode), 2645 root->fs_info->sectorsize); 2646 ret = btrfs_drop_extents(wc->trans, root, inode, 2647 from, (u64)-1, 1); 2648 if (!ret) { 2649 /* Update the inode's nbytes. 
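					 * btrfs_drop_extents() above adjusted
					 * nbytes when it dropped the extents
					 * beyond the inode's i_size.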
*/ 2650 ret = btrfs_update_inode(wc->trans, 2651 root, inode); 2652 } 2653 iput(inode); 2654 if (ret) 2655 break; 2656 } 2657 2658 ret = link_to_fixup_dir(wc->trans, root, 2659 path, key.objectid); 2660 if (ret) 2661 break; 2662 } 2663 2664 if (wc->ignore_cur_inode) 2665 continue; 2666 2667 if (key.type == BTRFS_DIR_INDEX_KEY && 2668 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2669 ret = replay_one_dir_item(wc->trans, root, path, 2670 eb, i, &key); 2671 if (ret) 2672 break; 2673 } 2674 2675 if (wc->stage < LOG_WALK_REPLAY_ALL) 2676 continue; 2677 2678 /* these keys are simply copied */ 2679 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2680 ret = overwrite_item(wc->trans, root, path, 2681 eb, i, &key); 2682 if (ret) 2683 break; 2684 } else if (key.type == BTRFS_INODE_REF_KEY || 2685 key.type == BTRFS_INODE_EXTREF_KEY) { 2686 ret = add_inode_ref(wc->trans, root, log, path, 2687 eb, i, &key); 2688 if (ret && ret != -ENOENT) 2689 break; 2690 ret = 0; 2691 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2692 ret = replay_one_extent(wc->trans, root, path, 2693 eb, i, &key); 2694 if (ret) 2695 break; 2696 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2697 ret = replay_one_dir_item(wc->trans, root, path, 2698 eb, i, &key); 2699 if (ret) 2700 break; 2701 } 2702 } 2703 btrfs_free_path(path); 2704 return ret; 2705} 2706 2707/* 2708 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2709 */ 2710static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2711{ 2712 struct btrfs_block_group *cache; 2713 2714 cache = btrfs_lookup_block_group(fs_info, start); 2715 if (!cache) { 2716 btrfs_err(fs_info, "unable to find block group for %llu", start); 2717 return; 2718 } 2719 2720 spin_lock(&cache->space_info->lock); 2721 spin_lock(&cache->lock); 2722 cache->reserved -= fs_info->nodesize; 2723 cache->space_info->bytes_reserved -= fs_info->nodesize; 2724 spin_unlock(&cache->lock); 2725 spin_unlock(&cache->space_info->lock); 2726 2727 btrfs_put_block_group(cache); 2728} 2729 2730static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2731 struct btrfs_root *root, 2732 struct btrfs_path *path, int *level, 2733 struct walk_control *wc) 2734{ 2735 struct btrfs_fs_info *fs_info = root->fs_info; 2736 u64 bytenr; 2737 u64 ptr_gen; 2738 struct extent_buffer *next; 2739 struct extent_buffer *cur; 2740 u32 blocksize; 2741 int ret = 0; 2742 2743 while (*level > 0) { 2744 struct btrfs_key first_key; 2745 2746 cur = path->nodes[*level]; 2747 2748 WARN_ON(btrfs_header_level(cur) != *level); 2749 2750 if (path->slots[*level] >= 2751 btrfs_header_nritems(cur)) 2752 break; 2753 2754 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2755 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2756 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2757 blocksize = fs_info->nodesize; 2758 2759 next = btrfs_find_create_tree_block(fs_info, bytenr); 2760 if (IS_ERR(next)) 2761 return PTR_ERR(next); 2762 2763 if (*level == 1) { 2764 ret = wc->process_func(root, next, wc, ptr_gen, 2765 *level - 1); 2766 if (ret) { 2767 free_extent_buffer(next); 2768 return ret; 2769 } 2770 2771 path->slots[*level]++; 2772 if (wc->free) { 2773 ret = btrfs_read_buffer(next, ptr_gen, 2774 *level - 1, &first_key); 2775 if (ret) { 2776 free_extent_buffer(next); 2777 return ret; 2778 } 2779 2780 if (trans) { 2781 btrfs_tree_lock(next); 2782 btrfs_set_lock_blocking_write(next); 2783 btrfs_clean_tree_block(next); 2784 btrfs_wait_tree_block_writeback(next); 2785 btrfs_tree_unlock(next); 
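					/*
					 * Pinning keeps this extent from
					 * being reallocated until the running
					 * transaction commits and pinned
					 * extents are released.
					 */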
2786 ret = btrfs_pin_reserved_extent(trans, 2787 bytenr, blocksize); 2788 if (ret) { 2789 free_extent_buffer(next); 2790 return ret; 2791 } 2792 } else { 2793 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2794 clear_extent_buffer_dirty(next); 2795 unaccount_log_buffer(fs_info, bytenr); 2796 } 2797 } 2798 free_extent_buffer(next); 2799 continue; 2800 } 2801 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2802 if (ret) { 2803 free_extent_buffer(next); 2804 return ret; 2805 } 2806 2807 if (path->nodes[*level-1]) 2808 free_extent_buffer(path->nodes[*level-1]); 2809 path->nodes[*level-1] = next; 2810 *level = btrfs_header_level(next); 2811 path->slots[*level] = 0; 2812 cond_resched(); 2813 } 2814 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2815 2816 cond_resched(); 2817 return 0; 2818} 2819 2820static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2821 struct btrfs_root *root, 2822 struct btrfs_path *path, int *level, 2823 struct walk_control *wc) 2824{ 2825 struct btrfs_fs_info *fs_info = root->fs_info; 2826 int i; 2827 int slot; 2828 int ret; 2829 2830 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2831 slot = path->slots[i]; 2832 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2833 path->slots[i]++; 2834 *level = i; 2835 WARN_ON(*level == 0); 2836 return 0; 2837 } else { 2838 ret = wc->process_func(root, path->nodes[*level], wc, 2839 btrfs_header_generation(path->nodes[*level]), 2840 *level); 2841 if (ret) 2842 return ret; 2843 2844 if (wc->free) { 2845 struct extent_buffer *next; 2846 2847 next = path->nodes[*level]; 2848 2849 if (trans) { 2850 btrfs_tree_lock(next); 2851 btrfs_set_lock_blocking_write(next); 2852 btrfs_clean_tree_block(next); 2853 btrfs_wait_tree_block_writeback(next); 2854 btrfs_tree_unlock(next); 2855 ret = btrfs_pin_reserved_extent(trans, 2856 path->nodes[*level]->start, 2857 path->nodes[*level]->len); 2858 if (ret) 2859 return ret; 2860 } else { 2861 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2862 clear_extent_buffer_dirty(next); 2863 2864 unaccount_log_buffer(fs_info, 2865 path->nodes[*level]->start); 2866 } 2867 } 2868 free_extent_buffer(path->nodes[*level]); 2869 path->nodes[*level] = NULL; 2870 *level = i + 1; 2871 } 2872 } 2873 return 1; 2874} 2875 2876/* 2877 * drop the reference count on the tree rooted at 'snap'. This traverses 2878 * the tree freeing any blocks that have a ref count of zero after being 2879 * decremented. 2880 */ 2881static int walk_log_tree(struct btrfs_trans_handle *trans, 2882 struct btrfs_root *log, struct walk_control *wc) 2883{ 2884 struct btrfs_fs_info *fs_info = log->fs_info; 2885 int ret = 0; 2886 int wret; 2887 int level; 2888 struct btrfs_path *path; 2889 int orig_level; 2890 2891 path = btrfs_alloc_path(); 2892 if (!path) 2893 return -ENOMEM; 2894 2895 level = btrfs_header_level(log->node); 2896 orig_level = level; 2897 path->nodes[level] = log->node; 2898 atomic_inc(&log->node->refs); 2899 path->slots[level] = 0; 2900 2901 while (1) { 2902 wret = walk_down_log_tree(trans, log, path, &level, wc); 2903 if (wret > 0) 2904 break; 2905 if (wret < 0) { 2906 ret = wret; 2907 goto out; 2908 } 2909 2910 wret = walk_up_log_tree(trans, log, path, &level, wc); 2911 if (wret > 0) 2912 break; 2913 if (wret < 0) { 2914 ret = wret; 2915 goto out; 2916 } 2917 } 2918 2919 /* was the root node processed? 
if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]),
			 orig_level);
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_set_lock_blocking_write(next);
				btrfs_clean_tree_block(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
				ret = btrfs_pin_reserved_extent(trans,
						next->start, next->len);
				if (ret)
					goto out;
			} else {
				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
					clear_extent_buffer_dirty(next);
				unaccount_log_buffer(fs_info, next->start);
			}
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
				&log->root_key, root_item);
	} else {
		ret = btrfs_update_root(trans, fs_info->log_root_tree,
				&log->root_key, root_item);
	}
	return ret;
}

static void wait_log_commit(struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	for (;;) {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);

		if (!(root->log_transid_committed < transid &&
		      atomic_read(&root->log_commit[index])))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[index], &wait);
}

static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}

static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	if (!ctx)
		return;

	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Invoked in log mutex context, or the caller must otherwise make sure
 * no other task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.
When this call is done, 3050 * you know that any inodes previously logged are safely on disk only 3051 * if it returns 0. 3052 * 3053 * Any other return value means you need to call btrfs_commit_transaction. 3054 * Some of the edge cases for fsyncing directories that have had unlinks 3055 * or renames done in the past mean that sometimes the only safe 3056 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3057 * that has happened. 3058 */ 3059int btrfs_sync_log(struct btrfs_trans_handle *trans, 3060 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3061{ 3062 int index1; 3063 int index2; 3064 int mark; 3065 int ret; 3066 struct btrfs_fs_info *fs_info = root->fs_info; 3067 struct btrfs_root *log = root->log_root; 3068 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3069 struct btrfs_root_item new_root_item; 3070 int log_transid = 0; 3071 struct btrfs_log_ctx root_log_ctx; 3072 struct blk_plug plug; 3073 3074 mutex_lock(&root->log_mutex); 3075 log_transid = ctx->log_transid; 3076 if (root->log_transid_committed >= log_transid) { 3077 mutex_unlock(&root->log_mutex); 3078 return ctx->log_ret; 3079 } 3080 3081 index1 = log_transid % 2; 3082 if (atomic_read(&root->log_commit[index1])) { 3083 wait_log_commit(root, log_transid); 3084 mutex_unlock(&root->log_mutex); 3085 return ctx->log_ret; 3086 } 3087 ASSERT(log_transid == root->log_transid); 3088 atomic_set(&root->log_commit[index1], 1); 3089 3090 /* wait for previous tree log sync to complete */ 3091 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3092 wait_log_commit(root, log_transid - 1); 3093 3094 while (1) { 3095 int batch = atomic_read(&root->log_batch); 3096 /* when we're on an ssd, just kick the log commit out */ 3097 if (!btrfs_test_opt(fs_info, SSD) && 3098 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3099 mutex_unlock(&root->log_mutex); 3100 schedule_timeout_uninterruptible(1); 3101 mutex_lock(&root->log_mutex); 3102 } 3103 wait_for_writer(root); 3104 if (batch == atomic_read(&root->log_batch)) 3105 break; 3106 } 3107 3108 /* bail out if we need to do a full commit */ 3109 if (btrfs_need_log_full_commit(trans)) { 3110 ret = -EAGAIN; 3111 mutex_unlock(&root->log_mutex); 3112 goto out; 3113 } 3114 3115 if (log_transid % 2 == 0) 3116 mark = EXTENT_DIRTY; 3117 else 3118 mark = EXTENT_NEW; 3119 3120 /* we start IO on all the marked extents here, but we don't actually 3121 * wait for them until later. 3122 */ 3123 blk_start_plug(&plug); 3124 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3125 if (ret) { 3126 blk_finish_plug(&plug); 3127 btrfs_abort_transaction(trans, ret); 3128 btrfs_set_log_full_commit(trans); 3129 mutex_unlock(&root->log_mutex); 3130 goto out; 3131 } 3132 3133 /* 3134 * We _must_ update under the root->log_mutex in order to make sure we 3135 * have a consistent view of the log root we are trying to commit at 3136 * this moment. 3137 * 3138 * We _must_ copy this into a local copy, because we are not holding the 3139 * log_root_tree->log_mutex yet. This is important because when we 3140 * commit the log_root_tree we must have a consistent view of the 3141 * log_root_tree when we update the super block to point at the 3142 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3143 * with the commit and possibly point at the new block which we may not 3144 * have written out. 
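	 * Hence the memcpy() into new_root_item below: that private copy is
	 * what update_log_root() later writes into the log root tree, safe
	 * from further changes to log->root_item.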
3145 */ 3146 btrfs_set_root_node(&log->root_item, log->node); 3147 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3148 3149 root->log_transid++; 3150 log->log_transid = root->log_transid; 3151 root->log_start_pid = 0; 3152 /* 3153 * IO has been started, blocks of the log tree have WRITTEN flag set 3154 * in their headers. new modifications of the log will be written to 3155 * new positions. so it's safe to allow log writers to go in. 3156 */ 3157 mutex_unlock(&root->log_mutex); 3158 3159 btrfs_init_log_ctx(&root_log_ctx, NULL); 3160 3161 mutex_lock(&log_root_tree->log_mutex); 3162 3163 index2 = log_root_tree->log_transid % 2; 3164 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3165 root_log_ctx.log_transid = log_root_tree->log_transid; 3166 3167 /* 3168 * Now we are safe to update the log_root_tree because we're under the 3169 * log_mutex, and we're a current writer so we're holding the commit 3170 * open until we drop the log_mutex. 3171 */ 3172 ret = update_log_root(trans, log, &new_root_item); 3173 if (ret) { 3174 if (!list_empty(&root_log_ctx.list)) 3175 list_del_init(&root_log_ctx.list); 3176 3177 blk_finish_plug(&plug); 3178 btrfs_set_log_full_commit(trans); 3179 3180 if (ret != -ENOSPC) { 3181 btrfs_abort_transaction(trans, ret); 3182 mutex_unlock(&log_root_tree->log_mutex); 3183 goto out; 3184 } 3185 btrfs_wait_tree_log_extents(log, mark); 3186 mutex_unlock(&log_root_tree->log_mutex); 3187 ret = -EAGAIN; 3188 goto out; 3189 } 3190 3191 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3192 blk_finish_plug(&plug); 3193 list_del_init(&root_log_ctx.list); 3194 mutex_unlock(&log_root_tree->log_mutex); 3195 ret = root_log_ctx.log_ret; 3196 goto out; 3197 } 3198 3199 index2 = root_log_ctx.log_transid % 2; 3200 if (atomic_read(&log_root_tree->log_commit[index2])) { 3201 blk_finish_plug(&plug); 3202 ret = btrfs_wait_tree_log_extents(log, mark); 3203 wait_log_commit(log_root_tree, 3204 root_log_ctx.log_transid); 3205 mutex_unlock(&log_root_tree->log_mutex); 3206 if (!ret) 3207 ret = root_log_ctx.log_ret; 3208 goto out; 3209 } 3210 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3211 atomic_set(&log_root_tree->log_commit[index2], 1); 3212 3213 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3214 wait_log_commit(log_root_tree, 3215 root_log_ctx.log_transid - 1); 3216 } 3217 3218 /* 3219 * now that we've moved on to the tree of log tree roots, 3220 * check the full commit flag again 3221 */ 3222 if (btrfs_need_log_full_commit(trans)) { 3223 blk_finish_plug(&plug); 3224 btrfs_wait_tree_log_extents(log, mark); 3225 mutex_unlock(&log_root_tree->log_mutex); 3226 ret = -EAGAIN; 3227 goto out_wake_log_root; 3228 } 3229 3230 ret = btrfs_write_marked_extents(fs_info, 3231 &log_root_tree->dirty_log_pages, 3232 EXTENT_DIRTY | EXTENT_NEW); 3233 blk_finish_plug(&plug); 3234 if (ret) { 3235 btrfs_set_log_full_commit(trans); 3236 btrfs_abort_transaction(trans, ret); 3237 mutex_unlock(&log_root_tree->log_mutex); 3238 goto out_wake_log_root; 3239 } 3240 ret = btrfs_wait_tree_log_extents(log, mark); 3241 if (!ret) 3242 ret = btrfs_wait_tree_log_extents(log_root_tree, 3243 EXTENT_NEW | EXTENT_DIRTY); 3244 if (ret) { 3245 btrfs_set_log_full_commit(trans); 3246 mutex_unlock(&log_root_tree->log_mutex); 3247 goto out_wake_log_root; 3248 } 3249 3250 btrfs_set_super_log_root(fs_info->super_for_commit, 3251 log_root_tree->node->start); 3252 btrfs_set_super_log_root_level(fs_info->super_for_commit, 3253 
btrfs_header_level(log_root_tree->node)); 3254 3255 log_root_tree->log_transid++; 3256 mutex_unlock(&log_root_tree->log_mutex); 3257 3258 /* 3259 * Nobody else is going to jump in and write the ctree 3260 * super here because the log_commit atomic below is protecting 3261 * us. We must be called with a transaction handle pinning 3262 * the running transaction open, so a full commit can't hop 3263 * in and cause problems either. 3264 */ 3265 ret = write_all_supers(fs_info, 1); 3266 if (ret) { 3267 btrfs_set_log_full_commit(trans); 3268 btrfs_abort_transaction(trans, ret); 3269 goto out_wake_log_root; 3270 } 3271 3272 mutex_lock(&root->log_mutex); 3273 if (root->last_log_commit < log_transid) 3274 root->last_log_commit = log_transid; 3275 mutex_unlock(&root->log_mutex); 3276 3277out_wake_log_root: 3278 mutex_lock(&log_root_tree->log_mutex); 3279 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3280 3281 log_root_tree->log_transid_committed++; 3282 atomic_set(&log_root_tree->log_commit[index2], 0); 3283 mutex_unlock(&log_root_tree->log_mutex); 3284 3285 /* 3286 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3287 * all the updates above are seen by the woken threads. It might not be 3288 * necessary, but proving that seems to be hard. 3289 */ 3290 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3291out: 3292 mutex_lock(&root->log_mutex); 3293 btrfs_remove_all_log_ctxs(root, index1, ret); 3294 root->log_transid_committed++; 3295 atomic_set(&root->log_commit[index1], 0); 3296 mutex_unlock(&root->log_mutex); 3297 3298 /* 3299 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3300 * all the updates above are seen by the woken threads. It might not be 3301 * necessary, but proving that seems to be hard. 3302 */ 3303 cond_wake_up(&root->log_commit_wait[index1]); 3304 return ret; 3305} 3306 3307static void free_log_tree(struct btrfs_trans_handle *trans, 3308 struct btrfs_root *log) 3309{ 3310 int ret; 3311 struct walk_control wc = { 3312 .free = 1, 3313 .process_func = process_one_buffer 3314 }; 3315 3316 ret = walk_log_tree(trans, log, &wc); 3317 if (ret) { 3318 if (trans) 3319 btrfs_abort_transaction(trans, ret); 3320 else 3321 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3322 } 3323 3324 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3325 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3326 extent_io_tree_release(&log->log_csum_range); 3327 btrfs_put_root(log); 3328} 3329 3330/* 3331 * free all the extents used by the tree log. This should be called 3332 * at commit time of the full transaction 3333 */ 3334int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3335{ 3336 if (root->log_root) { 3337 free_log_tree(trans, root->log_root); 3338 root->log_root = NULL; 3339 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3340 } 3341 return 0; 3342} 3343 3344int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3345 struct btrfs_fs_info *fs_info) 3346{ 3347 if (fs_info->log_root_tree) { 3348 free_log_tree(trans, fs_info->log_root_tree); 3349 fs_info->log_root_tree = NULL; 3350 } 3351 return 0; 3352} 3353 3354/* 3355 * Check if an inode was logged in the current transaction. We can't always rely 3356 * on an inode's logged_trans value, because it's an in-memory only field and 3357 * therefore not persisted. 
This means that its value is lost if the inode gets
 * evicted and loaded again from disk (in which case it has a value of 0, and
 * certainly it is smaller than any possible transaction ID). When that happens
 * the full_sync flag is set in the inode's runtime flags, so in that case we
 * assume eviction happened and ignore the logged_trans value, assuming the
 * worst case: that the inode was logged before in the current transaction.
 */
static bool inode_logged(struct btrfs_trans_handle *trans,
			 struct btrfs_inode *inode)
{
	if (inode->logged_trans == trans->transid)
		return true;

	if (inode->last_trans == trans->transid &&
	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
		return true;

	return false;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
 */
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct btrfs_inode *dir, u64 index)
{
	struct btrfs_root *log;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	int ret;
	int err = 0;
	int bytes_del = 0;
	u64 dir_ino = btrfs_ino(dir);

	if (!inode_logged(trans, dir))
		return 0;

	ret = join_running_log_trans(root);
	if (ret)
		return 0;

	mutex_lock(&dir->log_mutex);

	log = root->log_root;
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}

	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	btrfs_release_path(path);
	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}

	/* update the directory size in the log to reflect the names
	 * we have removed
	 */
	if (bytes_del) {
		struct btrfs_key key;

		key.objectid = dir_ino;
		key.offset = 0;
		key.type = BTRFS_INODE_ITEM_KEY;
		btrfs_release_path(path);

		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (ret == 0) {
			struct btrfs_inode_item *item;
3476 u64 i_size; 3477 3478 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3479 struct btrfs_inode_item); 3480 i_size = btrfs_inode_size(path->nodes[0], item); 3481 if (i_size > bytes_del) 3482 i_size -= bytes_del; 3483 else 3484 i_size = 0; 3485 btrfs_set_inode_size(path->nodes[0], item, i_size); 3486 btrfs_mark_buffer_dirty(path->nodes[0]); 3487 } else 3488 ret = 0; 3489 btrfs_release_path(path); 3490 } 3491fail: 3492 btrfs_free_path(path); 3493out_unlock: 3494 mutex_unlock(&dir->log_mutex); 3495 if (err == -ENOSPC) { 3496 btrfs_set_log_full_commit(trans); 3497 err = 0; 3498 } else if (err < 0 && err != -ENOENT) { 3499 /* ENOENT can be returned if the entry hasn't been fsynced yet */ 3500 btrfs_abort_transaction(trans, err); 3501 } 3502 3503 btrfs_end_log_trans(root); 3504 3505 return err; 3506} 3507 3508/* see comments for btrfs_del_dir_entries_in_log */ 3509int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3510 struct btrfs_root *root, 3511 const char *name, int name_len, 3512 struct btrfs_inode *inode, u64 dirid) 3513{ 3514 struct btrfs_root *log; 3515 u64 index; 3516 int ret; 3517 3518 if (!inode_logged(trans, inode)) 3519 return 0; 3520 3521 ret = join_running_log_trans(root); 3522 if (ret) 3523 return 0; 3524 log = root->log_root; 3525 mutex_lock(&inode->log_mutex); 3526 3527 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3528 dirid, &index); 3529 mutex_unlock(&inode->log_mutex); 3530 if (ret == -ENOSPC) { 3531 btrfs_set_log_full_commit(trans); 3532 ret = 0; 3533 } else if (ret < 0 && ret != -ENOENT) 3534 btrfs_abort_transaction(trans, ret); 3535 btrfs_end_log_trans(root); 3536 3537 return ret; 3538} 3539 3540/* 3541 * creates a range item in the log for 'dirid'. first_offset and 3542 * last_offset tell us which parts of the key space the log should 3543 * be considered authoritative for. 3544 */ 3545static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3546 struct btrfs_root *log, 3547 struct btrfs_path *path, 3548 int key_type, u64 dirid, 3549 u64 first_offset, u64 last_offset) 3550{ 3551 int ret; 3552 struct btrfs_key key; 3553 struct btrfs_dir_log_item *item; 3554 3555 key.objectid = dirid; 3556 key.offset = first_offset; 3557 if (key_type == BTRFS_DIR_ITEM_KEY) 3558 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3559 else 3560 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3561 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3562 if (ret) 3563 return ret; 3564 3565 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3566 struct btrfs_dir_log_item); 3567 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3568 btrfs_mark_buffer_dirty(path->nodes[0]); 3569 btrfs_release_path(path); 3570 return 0; 3571} 3572 3573/* 3574 * log all the items included in the current transaction for a given 3575 * directory. 
This also creates the range items in the log tree required 3576 * to replay anything deleted before the fsync 3577 */ 3578static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3579 struct btrfs_root *root, struct btrfs_inode *inode, 3580 struct btrfs_path *path, 3581 struct btrfs_path *dst_path, int key_type, 3582 struct btrfs_log_ctx *ctx, 3583 u64 min_offset, u64 *last_offset_ret) 3584{ 3585 struct btrfs_key min_key; 3586 struct btrfs_root *log = root->log_root; 3587 struct extent_buffer *src; 3588 int err = 0; 3589 int ret; 3590 int i; 3591 int nritems; 3592 u64 first_offset = min_offset; 3593 u64 last_offset = (u64)-1; 3594 u64 ino = btrfs_ino(inode); 3595 3596 log = root->log_root; 3597 3598 min_key.objectid = ino; 3599 min_key.type = key_type; 3600 min_key.offset = min_offset; 3601 3602 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3603 3604 /* 3605 * we didn't find anything from this transaction, see if there 3606 * is anything at all 3607 */ 3608 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3609 min_key.objectid = ino; 3610 min_key.type = key_type; 3611 min_key.offset = (u64)-1; 3612 btrfs_release_path(path); 3613 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3614 if (ret < 0) { 3615 btrfs_release_path(path); 3616 return ret; 3617 } 3618 ret = btrfs_previous_item(root, path, ino, key_type); 3619 3620 /* if ret == 0 there are items for this type, 3621 * create a range to tell us the last key of this type. 3622 * otherwise, there are no items in this directory after 3623 * *min_offset, and we create a range to indicate that. 3624 */ 3625 if (ret == 0) { 3626 struct btrfs_key tmp; 3627 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3628 path->slots[0]); 3629 if (key_type == tmp.type) 3630 first_offset = max(min_offset, tmp.offset) + 1; 3631 } 3632 goto done; 3633 } 3634 3635 /* go backward to find any previous key */ 3636 ret = btrfs_previous_item(root, path, ino, key_type); 3637 if (ret == 0) { 3638 struct btrfs_key tmp; 3639 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3640 if (key_type == tmp.type) { 3641 first_offset = tmp.offset; 3642 ret = overwrite_item(trans, log, dst_path, 3643 path->nodes[0], path->slots[0], 3644 &tmp); 3645 if (ret) { 3646 err = ret; 3647 goto done; 3648 } 3649 } 3650 } 3651 btrfs_release_path(path); 3652 3653 /* 3654 * Find the first key from this transaction again. See the note for 3655 * log_new_dir_dentries, if we're logging a directory recursively we 3656 * won't be holding its i_mutex, which means we can modify the directory 3657 * while we're logging it. If we remove an entry between our first 3658 * search and this search we'll not find the key again and can just 3659 * bail. 
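	 *
	 * (The need_resched() check in the loop below re-enters through the
	 * search label for the same reason: once the path is dropped and we
	 * reschedule, the leaf may have changed under us.)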
 */
search:
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0)
		goto done;

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			struct btrfs_dir_item *di;

			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;

			if (need_resched()) {
				btrfs_release_path(path);
				cond_resched();
				goto search;
			}

			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}

			/*
			 * We must make sure that when we log a directory
			 * entry, the corresponding inode, after log replay,
			 * has a matching link count. For example:
			 *
			 * touch foo
			 * mkdir mydir
			 * sync
			 * ln foo mydir/bar
			 * xfs_io -c "fsync" mydir
			 * <crash>
			 * <mount fs and log replay>
			 *
			 * Would result in an fsync log that, when replayed,
			 * leaves our file inode with a link count of 1 but
			 * two directory entries pointing to the same inode.
			 * After removing one of the names, it would not be
			 * possible to remove the other name, which always
			 * resulted in stale file handle errors, and it would
			 * not be possible to rmdir the parent directory,
			 * since its i_size could never decrement to the value
			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY
			 * errors.
			 */
			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			if (ctx &&
			    (btrfs_dir_transid(src, di) == trans->transid ||
			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			    tmp.type != BTRFS_ROOT_ITEM_KEY)
				ctx->log_new_dentries = true;
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret) {
			if (ret == 1)
				last_offset = (u64)-1;
			else
				err = ret;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes.  We find all the
 * items from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
3781 * 3782 * In order for that scan to work, we must include one key smaller than 3783 * the smallest logged by this transaction and one key larger than the largest 3784 * key logged by this transaction. 3785 */ 3786static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3787 struct btrfs_root *root, struct btrfs_inode *inode, 3788 struct btrfs_path *path, 3789 struct btrfs_path *dst_path, 3790 struct btrfs_log_ctx *ctx) 3791{ 3792 u64 min_key; 3793 u64 max_key; 3794 int ret; 3795 int key_type = BTRFS_DIR_ITEM_KEY; 3796 3797again: 3798 min_key = 0; 3799 max_key = 0; 3800 while (1) { 3801 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3802 ctx, min_key, &max_key); 3803 if (ret) 3804 return ret; 3805 if (max_key == (u64)-1) 3806 break; 3807 min_key = max_key + 1; 3808 } 3809 3810 if (key_type == BTRFS_DIR_ITEM_KEY) { 3811 key_type = BTRFS_DIR_INDEX_KEY; 3812 goto again; 3813 } 3814 return 0; 3815} 3816 3817/* 3818 * a helper function to drop items from the log before we relog an 3819 * inode. max_key_type indicates the highest item type to remove. 3820 * This cannot be run for file data extents because it does not 3821 * free the extents they point to. 3822 */ 3823static int drop_objectid_items(struct btrfs_trans_handle *trans, 3824 struct btrfs_root *log, 3825 struct btrfs_path *path, 3826 u64 objectid, int max_key_type) 3827{ 3828 int ret; 3829 struct btrfs_key key; 3830 struct btrfs_key found_key; 3831 int start_slot; 3832 3833 key.objectid = objectid; 3834 key.type = max_key_type; 3835 key.offset = (u64)-1; 3836 3837 while (1) { 3838 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3839 BUG_ON(ret == 0); /* Logic error */ 3840 if (ret < 0) 3841 break; 3842 3843 if (path->slots[0] == 0) 3844 break; 3845 3846 path->slots[0]--; 3847 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3848 path->slots[0]); 3849 3850 if (found_key.objectid != objectid) 3851 break; 3852 3853 found_key.offset = 0; 3854 found_key.type = 0; 3855 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 3856 if (ret < 0) 3857 break; 3858 3859 ret = btrfs_del_items(trans, log, path, start_slot, 3860 path->slots[0] - start_slot + 1); 3861 /* 3862 * If start slot isn't 0 then we don't need to re-search, we've 3863 * found the last guy with the objectid in this tree. 
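		 *
		 * (Each pass starts the search from offset (u64)-1 and steps
		 * one slot back, so deletion always proceeds from the tail
		 * end of the key space for this objectid.)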
 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token, leaf);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between a logging done
		 * just to say 'this inode exists' and one meant
		 * to say 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(&token, item, 0);
		btrfs_set_token_inode_size(&token, item, logged_isize);
	} else {
		btrfs_set_token_inode_generation(&token, item,
						 BTRFS_I(inode)->generation);
		btrfs_set_token_inode_size(&token, item, inode->i_size);
	}

	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode->i_ctime.tv_nsec);

	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));

	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
	btrfs_set_token_inode_block_group(&token, item, 0);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct btrfs_inode *inode)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &inode->location, sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
			0, 0);
	btrfs_release_path(path);
	return 0;
}

static int log_csums(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode,
		     struct btrfs_root *log_root,
		     struct btrfs_ordered_sum *sums)
{
	const u64 lock_end = sums->bytenr + sums->len - 1;
	struct extent_state *cached_state = NULL;
	int ret;

	/*
	 * If this inode was not used for reflink operations in the current
	 * transaction with new extents, then do the fast path, no need to
	 * worry about logging checksum items with overlapping ranges.
	 */
	if (inode->last_reflink_trans < trans->transid)
		return btrfs_csum_file_blocks(trans, log_root, sums);

	/*
	 * Serialize logging for checksums.
This is to avoid racing with the 3966 * same checksum being logged by another task that is logging another 3967 * file which happens to refer to the same extent as well. Such races 3968 * can leave checksum items in the log with overlapping ranges. 3969 */ 3970 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 3971 lock_end, &cached_state); 3972 if (ret) 3973 return ret; 3974 /* 3975 * Due to extent cloning, we might have logged a csum item that covers a 3976 * subrange of a cloned extent, and later we can end up logging a csum 3977 * item for a larger subrange of the same extent or the entire range. 3978 * This would leave csum items in the log tree that cover the same range 3979 * and break the searches for checksums in the log tree, resulting in 3980 * some checksums missing in the fs/subvolume tree. So just delete (or 3981 * trim and adjust) any existing csum items in the log for this range. 3982 */ 3983 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 3984 if (!ret) 3985 ret = btrfs_csum_file_blocks(trans, log_root, sums); 3986 3987 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 3988 &cached_state); 3989 3990 return ret; 3991} 3992 3993static noinline int copy_items(struct btrfs_trans_handle *trans, 3994 struct btrfs_inode *inode, 3995 struct btrfs_path *dst_path, 3996 struct btrfs_path *src_path, 3997 int start_slot, int nr, int inode_only, 3998 u64 logged_isize) 3999{ 4000 struct btrfs_fs_info *fs_info = trans->fs_info; 4001 unsigned long src_offset; 4002 unsigned long dst_offset; 4003 struct btrfs_root *log = inode->root->log_root; 4004 struct btrfs_file_extent_item *extent; 4005 struct btrfs_inode_item *inode_item; 4006 struct extent_buffer *src = src_path->nodes[0]; 4007 int ret; 4008 struct btrfs_key *ins_keys; 4009 u32 *ins_sizes; 4010 char *ins_data; 4011 int i; 4012 struct list_head ordered_sums; 4013 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 4014 4015 INIT_LIST_HEAD(&ordered_sums); 4016 4017 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4018 nr * sizeof(u32), GFP_NOFS); 4019 if (!ins_data) 4020 return -ENOMEM; 4021 4022 ins_sizes = (u32 *)ins_data; 4023 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4024 4025 for (i = 0; i < nr; i++) { 4026 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 4027 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4028 } 4029 ret = btrfs_insert_empty_items(trans, log, dst_path, 4030 ins_keys, ins_sizes, nr); 4031 if (ret) { 4032 kfree(ins_data); 4033 return ret; 4034 } 4035 4036 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4037 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4038 dst_path->slots[0]); 4039 4040 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4041 4042 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4043 inode_item = btrfs_item_ptr(dst_path->nodes[0], 4044 dst_path->slots[0], 4045 struct btrfs_inode_item); 4046 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4047 &inode->vfs_inode, 4048 inode_only == LOG_INODE_EXISTS, 4049 logged_isize); 4050 } else { 4051 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4052 src_offset, ins_sizes[i]); 4053 } 4054 4055 /* take a reference on file data extents so that truncates 4056 * or deletes of this inode don't have to relog the inode 4057 * again 4058 */ 4059 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4060 !skip_csum) { 4061 int found_type; 4062 extent = btrfs_item_ptr(src, start_slot + i, 4063 struct btrfs_file_extent_item); 4064 4065 if 
(btrfs_file_extent_generation(src, extent) < trans->transid) 4066 continue; 4067 4068 found_type = btrfs_file_extent_type(src, extent); 4069 if (found_type == BTRFS_FILE_EXTENT_REG) { 4070 u64 ds, dl, cs, cl; 4071 ds = btrfs_file_extent_disk_bytenr(src, 4072 extent); 4073 /* ds == 0 is a hole */ 4074 if (ds == 0) 4075 continue; 4076 4077 dl = btrfs_file_extent_disk_num_bytes(src, 4078 extent); 4079 cs = btrfs_file_extent_offset(src, extent); 4080 cl = btrfs_file_extent_num_bytes(src, 4081 extent); 4082 if (btrfs_file_extent_compression(src, 4083 extent)) { 4084 cs = 0; 4085 cl = dl; 4086 } 4087 4088 ret = btrfs_lookup_csums_range( 4089 fs_info->csum_root, 4090 ds + cs, ds + cs + cl - 1, 4091 &ordered_sums, 0); 4092 if (ret) 4093 break; 4094 } 4095 } 4096 } 4097 4098 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4099 btrfs_release_path(dst_path); 4100 kfree(ins_data); 4101 4102 /* 4103 * we have to do this after the loop above to avoid changing the 4104 * log tree while trying to change the log tree. 4105 */ 4106 while (!list_empty(&ordered_sums)) { 4107 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4108 struct btrfs_ordered_sum, 4109 list); 4110 if (!ret) 4111 ret = log_csums(trans, inode, log, sums); 4112 list_del(&sums->list); 4113 kfree(sums); 4114 } 4115 4116 return ret; 4117} 4118 4119static int extent_cmp(void *priv, const struct list_head *a, 4120 const struct list_head *b) 4121{ 4122 struct extent_map *em1, *em2; 4123 4124 em1 = list_entry(a, struct extent_map, list); 4125 em2 = list_entry(b, struct extent_map, list); 4126 4127 if (em1->start < em2->start) 4128 return -1; 4129 else if (em1->start > em2->start) 4130 return 1; 4131 return 0; 4132} 4133 4134static int log_extent_csums(struct btrfs_trans_handle *trans, 4135 struct btrfs_inode *inode, 4136 struct btrfs_root *log_root, 4137 const struct extent_map *em, 4138 struct btrfs_log_ctx *ctx) 4139{ 4140 struct btrfs_ordered_extent *ordered; 4141 u64 csum_offset; 4142 u64 csum_len; 4143 u64 mod_start = em->mod_start; 4144 u64 mod_len = em->mod_len; 4145 LIST_HEAD(ordered_sums); 4146 int ret = 0; 4147 4148 if (inode->flags & BTRFS_INODE_NODATASUM || 4149 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4150 em->block_start == EXTENT_MAP_HOLE) 4151 return 0; 4152 4153 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4154 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4155 const u64 mod_end = mod_start + mod_len; 4156 struct btrfs_ordered_sum *sums; 4157 4158 if (mod_len == 0) 4159 break; 4160 4161 if (ordered_end <= mod_start) 4162 continue; 4163 if (mod_end <= ordered->file_offset) 4164 break; 4165 4166 /* 4167 * We are going to copy all the csums on this ordered extent, so 4168 * go ahead and adjust mod_start and mod_len in case this ordered 4169 * extent has already been logged. 4170 */ 4171 if (ordered->file_offset > mod_start) { 4172 if (ordered_end >= mod_end) 4173 mod_len = ordered->file_offset - mod_start; 4174 /* 4175 * If we have this case 4176 * 4177 * |--------- logged extent ---------| 4178 * |----- ordered extent ----| 4179 * 4180 * Just don't mess with mod_start and mod_len, we'll 4181 * just end up logging more csums than we need and it 4182 * will be ok. 4183 */ 4184 } else { 4185 if (ordered_end < mod_end) { 4186 mod_len = mod_end - ordered_end; 4187 mod_start = ordered_end; 4188 } else { 4189 mod_len = 0; 4190 } 4191 } 4192 4193 /* 4194 * To keep us from looping for the above case of an ordered 4195 * extent that falls inside of the logged extent. 
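 * We test-and-set BTRFS_ORDERED_LOGGED_CSUM just below so that each
 * ordered extent has its checksums copied into the log at most once.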
4196 */ 4197 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4198 continue; 4199 4200 list_for_each_entry(sums, &ordered->list, list) { 4201 ret = log_csums(trans, inode, log_root, sums); 4202 if (ret) 4203 return ret; 4204 } 4205 } 4206 4207 /* We're done, found all csums in the ordered extents. */ 4208 if (mod_len == 0) 4209 return 0; 4210 4211 /* If we're compressed we have to save the entire range of csums. */ 4212 if (em->compress_type) { 4213 csum_offset = 0; 4214 csum_len = max(em->block_len, em->orig_block_len); 4215 } else { 4216 csum_offset = mod_start - em->start; 4217 csum_len = mod_len; 4218 } 4219 4220 /* block start is already adjusted for the file extent offset. */ 4221 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4222 em->block_start + csum_offset, 4223 em->block_start + csum_offset + 4224 csum_len - 1, &ordered_sums, 0); 4225 if (ret) 4226 return ret; 4227 4228 while (!list_empty(&ordered_sums)) { 4229 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4230 struct btrfs_ordered_sum, 4231 list); 4232 if (!ret) 4233 ret = log_csums(trans, inode, log_root, sums); 4234 list_del(&sums->list); 4235 kfree(sums); 4236 } 4237 4238 return ret; 4239} 4240 4241static int log_one_extent(struct btrfs_trans_handle *trans, 4242 struct btrfs_inode *inode, struct btrfs_root *root, 4243 const struct extent_map *em, 4244 struct btrfs_path *path, 4245 struct btrfs_log_ctx *ctx) 4246{ 4247 struct btrfs_root *log = root->log_root; 4248 struct btrfs_file_extent_item *fi; 4249 struct extent_buffer *leaf; 4250 struct btrfs_map_token token; 4251 struct btrfs_key key; 4252 u64 extent_offset = em->start - em->orig_start; 4253 u64 block_len; 4254 int ret; 4255 int extent_inserted = 0; 4256 4257 ret = log_extent_csums(trans, inode, log, em, ctx); 4258 if (ret) 4259 return ret; 4260 4261 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 4262 em->start + em->len, NULL, 0, 1, 4263 sizeof(*fi), &extent_inserted); 4264 if (ret) 4265 return ret; 4266 4267 if (!extent_inserted) { 4268 key.objectid = btrfs_ino(inode); 4269 key.type = BTRFS_EXTENT_DATA_KEY; 4270 key.offset = em->start; 4271 4272 ret = btrfs_insert_empty_item(trans, log, path, &key, 4273 sizeof(*fi)); 4274 if (ret) 4275 return ret; 4276 } 4277 leaf = path->nodes[0]; 4278 btrfs_init_map_token(&token, leaf); 4279 fi = btrfs_item_ptr(leaf, path->slots[0], 4280 struct btrfs_file_extent_item); 4281 4282 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4283 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4284 btrfs_set_token_file_extent_type(&token, fi, 4285 BTRFS_FILE_EXTENT_PREALLOC); 4286 else 4287 btrfs_set_token_file_extent_type(&token, fi, 4288 BTRFS_FILE_EXTENT_REG); 4289 4290 block_len = max(em->block_len, em->orig_block_len); 4291 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4292 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4293 em->block_start); 4294 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4295 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4296 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4297 em->block_start - 4298 extent_offset); 4299 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4300 } else { 4301 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4302 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4303 } 4304 4305 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4306 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4307 
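	/*
	 * ram_bytes is the uncompressed size of the extent; for compressed
	 * extents it differs from the on-disk size set just above.
	 */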
btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4308 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4309 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4310 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4311 btrfs_mark_buffer_dirty(leaf); 4312 4313 btrfs_release_path(path); 4314 4315 return ret; 4316} 4317 4318/* 4319 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4320 * lose them after doing a full/fast fsync and replaying the log. We scan the 4321 * subvolume's root instead of iterating the inode's extent map tree because 4322 * otherwise we can log incorrect extent items based on extent map conversion. 4323 * That can happen due to the fact that extent maps are merged when they 4324 * are not in the extent map tree's list of modified extents. 4325 */ 4326static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4327 struct btrfs_inode *inode, 4328 struct btrfs_path *path) 4329{ 4330 struct btrfs_root *root = inode->root; 4331 struct btrfs_key key; 4332 const u64 i_size = i_size_read(&inode->vfs_inode); 4333 const u64 ino = btrfs_ino(inode); 4334 struct btrfs_path *dst_path = NULL; 4335 bool dropped_extents = false; 4336 u64 truncate_offset = i_size; 4337 struct extent_buffer *leaf; 4338 int slot; 4339 int ins_nr = 0; 4340 int start_slot = 0; 4341 int ret; 4342 4343 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4344 return 0; 4345 4346 key.objectid = ino; 4347 key.type = BTRFS_EXTENT_DATA_KEY; 4348 key.offset = i_size; 4349 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4350 if (ret < 0) 4351 goto out; 4352 4353 /* 4354 * We must check if there is a prealloc extent that starts before the 4355 * i_size and crosses the i_size boundary. This is to ensure later we 4356 * truncate down to the end of that extent and not to the i_size, as 4357 * otherwise we end up losing part of the prealloc extent after a log 4358 * replay and with an implicit hole if there is another prealloc extent 4359 * that starts at an offset beyond i_size. 
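 *
 * An illustrative layout (offsets made up):
 *
 *   |----- prealloc extent A -----|      |----- prealloc extent B -----|
 *                  ^ i_size
 *
 * We must truncate the log copy at the end of extent A rather than at
 * i_size, otherwise after replay the range [i_size, end of A) becomes a
 * hole.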
4360 */ 4361 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4362 if (ret < 0) 4363 goto out; 4364 4365 if (ret == 0) { 4366 struct btrfs_file_extent_item *ei; 4367 4368 leaf = path->nodes[0]; 4369 slot = path->slots[0]; 4370 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4371 4372 if (btrfs_file_extent_type(leaf, ei) == 4373 BTRFS_FILE_EXTENT_PREALLOC) { 4374 u64 extent_end; 4375 4376 btrfs_item_key_to_cpu(leaf, &key, slot); 4377 extent_end = key.offset + 4378 btrfs_file_extent_num_bytes(leaf, ei); 4379 4380 if (extent_end > i_size) 4381 truncate_offset = extent_end; 4382 } 4383 } else { 4384 ret = 0; 4385 } 4386 4387 while (true) { 4388 leaf = path->nodes[0]; 4389 slot = path->slots[0]; 4390 4391 if (slot >= btrfs_header_nritems(leaf)) { 4392 if (ins_nr > 0) { 4393 ret = copy_items(trans, inode, dst_path, path, 4394 start_slot, ins_nr, 1, 0); 4395 if (ret < 0) 4396 goto out; 4397 ins_nr = 0; 4398 } 4399 ret = btrfs_next_leaf(root, path); 4400 if (ret < 0) 4401 goto out; 4402 if (ret > 0) { 4403 ret = 0; 4404 break; 4405 } 4406 continue; 4407 } 4408 4409 btrfs_item_key_to_cpu(leaf, &key, slot); 4410 if (key.objectid > ino) 4411 break; 4412 if (WARN_ON_ONCE(key.objectid < ino) || 4413 key.type < BTRFS_EXTENT_DATA_KEY || 4414 key.offset < i_size) { 4415 path->slots[0]++; 4416 continue; 4417 } 4418 if (!dropped_extents) { 4419 /* 4420 * Avoid logging extent items logged in past fsync calls 4421 * and leading to duplicate keys in the log tree. 4422 */ 4423 do { 4424 ret = btrfs_truncate_inode_items(trans, 4425 root->log_root, 4426 &inode->vfs_inode, 4427 truncate_offset, 4428 BTRFS_EXTENT_DATA_KEY); 4429 } while (ret == -EAGAIN); 4430 if (ret) 4431 goto out; 4432 dropped_extents = true; 4433 } 4434 if (ins_nr == 0) 4435 start_slot = slot; 4436 ins_nr++; 4437 path->slots[0]++; 4438 if (!dst_path) { 4439 dst_path = btrfs_alloc_path(); 4440 if (!dst_path) { 4441 ret = -ENOMEM; 4442 goto out; 4443 } 4444 } 4445 } 4446 if (ins_nr > 0) 4447 ret = copy_items(trans, inode, dst_path, path, 4448 start_slot, ins_nr, 1, 0); 4449out: 4450 btrfs_release_path(path); 4451 btrfs_free_path(dst_path); 4452 return ret; 4453} 4454 4455static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4456 struct btrfs_root *root, 4457 struct btrfs_inode *inode, 4458 struct btrfs_path *path, 4459 struct btrfs_log_ctx *ctx) 4460{ 4461 struct btrfs_ordered_extent *ordered; 4462 struct btrfs_ordered_extent *tmp; 4463 struct extent_map *em, *n; 4464 struct list_head extents; 4465 struct extent_map_tree *tree = &inode->extent_tree; 4466 u64 test_gen; 4467 int ret = 0; 4468 int num = 0; 4469 4470 INIT_LIST_HEAD(&extents); 4471 4472 write_lock(&tree->lock); 4473 test_gen = root->fs_info->last_trans_committed; 4474 4475 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4476 list_del_init(&em->list); 4477 /* 4478 * Just an arbitrary number, this can be really CPU intensive 4479 * once we start getting a lot of extents, and really once we 4480 * have a bunch of extents we just want to commit since it will 4481 * be faster. 4482 */ 4483 if (++num > 32768) { 4484 list_del_init(&tree->modified_extents); 4485 ret = -EFBIG; 4486 goto process; 4487 } 4488 4489 if (em->generation <= test_gen) 4490 continue; 4491 4492 /* We log prealloc extents beyond eof later. 
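 * They are handled by btrfs_log_prealloc_extents(), which we call below
 * once all the modified extents were processed.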
*/ 4493 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4494 em->start >= i_size_read(&inode->vfs_inode)) 4495 continue; 4496 4497 /* Need a ref to keep it from getting evicted from cache */ 4498 refcount_inc(&em->refs); 4499 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4500 list_add_tail(&em->list, &extents); 4501 num++; 4502 } 4503 4504 list_sort(NULL, &extents, extent_cmp); 4505process: 4506 while (!list_empty(&extents)) { 4507 em = list_entry(extents.next, struct extent_map, list); 4508 4509 list_del_init(&em->list); 4510 4511 /* 4512 * If we had an error we just need to delete everybody from our 4513 * private list. 4514 */ 4515 if (ret) { 4516 clear_em_logging(tree, em); 4517 free_extent_map(em); 4518 continue; 4519 } 4520 4521 write_unlock(&tree->lock); 4522 4523 ret = log_one_extent(trans, inode, root, em, path, ctx); 4524 write_lock(&tree->lock); 4525 clear_em_logging(tree, em); 4526 free_extent_map(em); 4527 } 4528 WARN_ON(!list_empty(&extents)); 4529 write_unlock(&tree->lock); 4530 4531 btrfs_release_path(path); 4532 if (!ret) 4533 ret = btrfs_log_prealloc_extents(trans, inode, path); 4534 if (ret) 4535 return ret; 4536 4537 /* 4538 * We have logged all extents successfully, now make sure the commit of 4539 * the current transaction waits for the ordered extents to complete 4540 * before it commits and wipes out the log trees, otherwise we would 4541 * lose data if an ordered extents completes after the transaction 4542 * commits and a power failure happens after the transaction commit. 4543 */ 4544 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4545 list_del_init(&ordered->log_list); 4546 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4547 4548 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4549 spin_lock_irq(&inode->ordered_tree.lock); 4550 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4551 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4552 atomic_inc(&trans->transaction->pending_ordered); 4553 } 4554 spin_unlock_irq(&inode->ordered_tree.lock); 4555 } 4556 btrfs_put_ordered_extent(ordered); 4557 } 4558 4559 return 0; 4560} 4561 4562static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4563 struct btrfs_path *path, u64 *size_ret) 4564{ 4565 struct btrfs_key key; 4566 int ret; 4567 4568 key.objectid = btrfs_ino(inode); 4569 key.type = BTRFS_INODE_ITEM_KEY; 4570 key.offset = 0; 4571 4572 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4573 if (ret < 0) { 4574 return ret; 4575 } else if (ret > 0) { 4576 *size_ret = 0; 4577 } else { 4578 struct btrfs_inode_item *item; 4579 4580 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4581 struct btrfs_inode_item); 4582 *size_ret = btrfs_inode_size(path->nodes[0], item); 4583 /* 4584 * If the in-memory inode's i_size is smaller then the inode 4585 * size stored in the btree, return the inode's i_size, so 4586 * that we get a correct inode size after replaying the log 4587 * when before a power failure we had a shrinking truncate 4588 * followed by addition of a new name (rename / new hard link). 4589 * Otherwise return the inode size from the btree, to avoid 4590 * data loss when replaying a log due to previously doing a 4591 * write that expands the inode's size and logging a new name 4592 * immediately after. 4593 */ 4594 if (*size_ret > inode->vfs_inode.i_size) 4595 *size_ret = inode->vfs_inode.i_size; 4596 } 4597 4598 btrfs_release_path(path); 4599 return 0; 4600} 4601 4602/* 4603 * At the moment we always log all xattrs. 
This is to figure out at log replay 4604 * time which xattrs must have their deletion replayed. If a xattr is missing 4605 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4606 * because if a xattr is deleted, the inode is fsynced and a power failure 4607 * happens, causing the log to be replayed the next time the fs is mounted, 4608 * we want the xattr to not exist anymore (same behaviour as other filesystems 4609 * with a journal, ext3/4, xfs, f2fs, etc). 4610 */ 4611static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4612 struct btrfs_root *root, 4613 struct btrfs_inode *inode, 4614 struct btrfs_path *path, 4615 struct btrfs_path *dst_path) 4616{ 4617 int ret; 4618 struct btrfs_key key; 4619 const u64 ino = btrfs_ino(inode); 4620 int ins_nr = 0; 4621 int start_slot = 0; 4622 bool found_xattrs = false; 4623 4624 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 4625 return 0; 4626 4627 key.objectid = ino; 4628 key.type = BTRFS_XATTR_ITEM_KEY; 4629 key.offset = 0; 4630 4631 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4632 if (ret < 0) 4633 return ret; 4634 4635 while (true) { 4636 int slot = path->slots[0]; 4637 struct extent_buffer *leaf = path->nodes[0]; 4638 int nritems = btrfs_header_nritems(leaf); 4639 4640 if (slot >= nritems) { 4641 if (ins_nr > 0) { 4642 ret = copy_items(trans, inode, dst_path, path, 4643 start_slot, ins_nr, 1, 0); 4644 if (ret < 0) 4645 return ret; 4646 ins_nr = 0; 4647 } 4648 ret = btrfs_next_leaf(root, path); 4649 if (ret < 0) 4650 return ret; 4651 else if (ret > 0) 4652 break; 4653 continue; 4654 } 4655 4656 btrfs_item_key_to_cpu(leaf, &key, slot); 4657 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4658 break; 4659 4660 if (ins_nr == 0) 4661 start_slot = slot; 4662 ins_nr++; 4663 path->slots[0]++; 4664 found_xattrs = true; 4665 cond_resched(); 4666 } 4667 if (ins_nr > 0) { 4668 ret = copy_items(trans, inode, dst_path, path, 4669 start_slot, ins_nr, 1, 0); 4670 if (ret < 0) 4671 return ret; 4672 } 4673 4674 if (!found_xattrs) 4675 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 4676 4677 return 0; 4678} 4679 4680/* 4681 * When using the NO_HOLES feature if we punched a hole that causes the 4682 * deletion of entire leafs or all the extent items of the first leaf (the one 4683 * that contains the inode item and references) we may end up not processing 4684 * any extents, because there are no leafs with a generation matching the 4685 * current transaction that have extent items for our inode. So we need to find 4686 * if any holes exist and then log them. We also need to log holes after any 4687 * truncate operation that changes the inode's size. 
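 *
 * A reproducer sketch (hypothetical, not from an actual test case):
 *
 *   xfs_io -f -c "pwrite 0 256K" -c "fsync" /mnt/foo
 *   xfs_io -c "fpunch 0 256K" /mnt/foo
 *   xfs_io -c "fsync" /mnt/foo
 *   <power fail>
 *
 * Without the hole file extent items inserted below, log replay could
 * leave the old extents in place and reads of the punched range would
 * return stale data instead of zeroes.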
4688 */ 4689static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4690 struct btrfs_root *root, 4691 struct btrfs_inode *inode, 4692 struct btrfs_path *path) 4693{ 4694 struct btrfs_fs_info *fs_info = root->fs_info; 4695 struct btrfs_key key; 4696 const u64 ino = btrfs_ino(inode); 4697 const u64 i_size = i_size_read(&inode->vfs_inode); 4698 u64 prev_extent_end = 0; 4699 int ret; 4700 4701 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4702 return 0; 4703 4704 key.objectid = ino; 4705 key.type = BTRFS_EXTENT_DATA_KEY; 4706 key.offset = 0; 4707 4708 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4709 if (ret < 0) 4710 return ret; 4711 4712 while (true) { 4713 struct extent_buffer *leaf = path->nodes[0]; 4714 4715 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4716 ret = btrfs_next_leaf(root, path); 4717 if (ret < 0) 4718 return ret; 4719 if (ret > 0) { 4720 ret = 0; 4721 break; 4722 } 4723 leaf = path->nodes[0]; 4724 } 4725 4726 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4727 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4728 break; 4729 4730 /* We have a hole, log it. */ 4731 if (prev_extent_end < key.offset) { 4732 const u64 hole_len = key.offset - prev_extent_end; 4733 4734 /* 4735 * Release the path to avoid deadlocks with other code 4736 * paths that search the root while holding locks on 4737 * leafs from the log root. 4738 */ 4739 btrfs_release_path(path); 4740 ret = btrfs_insert_file_extent(trans, root->log_root, 4741 ino, prev_extent_end, 0, 4742 0, hole_len, 0, hole_len, 4743 0, 0, 0); 4744 if (ret < 0) 4745 return ret; 4746 4747 /* 4748 * Search for the same key again in the root. Since it's 4749 * an extent item and we are holding the inode lock, the 4750 * key must still exist. If it doesn't just emit warning 4751 * and return an error to fall back to a transaction 4752 * commit. 4753 */ 4754 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4755 if (ret < 0) 4756 return ret; 4757 if (WARN_ON(ret > 0)) 4758 return -ENOENT; 4759 leaf = path->nodes[0]; 4760 } 4761 4762 prev_extent_end = btrfs_file_extent_end(path); 4763 path->slots[0]++; 4764 cond_resched(); 4765 } 4766 4767 if (prev_extent_end < i_size) { 4768 u64 hole_len; 4769 4770 btrfs_release_path(path); 4771 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 4772 ret = btrfs_insert_file_extent(trans, root->log_root, 4773 ino, prev_extent_end, 0, 0, 4774 hole_len, 0, hole_len, 4775 0, 0, 0); 4776 if (ret < 0) 4777 return ret; 4778 } 4779 4780 return 0; 4781} 4782 4783/* 4784 * When we are logging a new inode X, check if it doesn't have a reference that 4785 * matches the reference from some other inode Y created in a past transaction 4786 * and that was renamed in the current transaction. If we don't do this, then at 4787 * log replay time we can lose inode Y (and all its files if it's a directory): 4788 * 4789 * mkdir /mnt/x 4790 * echo "hello world" > /mnt/x/foobar 4791 * sync 4792 * mv /mnt/x /mnt/y 4793 * mkdir /mnt/x # or touch /mnt/x 4794 * xfs_io -c fsync /mnt/x 4795 * <power fail> 4796 * mount fs, trigger log replay 4797 * 4798 * After the log replay procedure, we would lose the first directory and all its 4799 * files (file foobar). 
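 * (At replay time the new reference for the fsynced inode replaces the
 * conflicting directory entry that still points to inode Y, and since
 * Y's new name was never logged, nothing recreates it.)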
4800 * For the case where inode Y is not a directory we simply end up losing it: 4801 * 4802 * echo "123" > /mnt/foo 4803 * sync 4804 * mv /mnt/foo /mnt/bar 4805 * echo "abc" > /mnt/foo 4806 * xfs_io -c fsync /mnt/foo 4807 * <power fail> 4808 * 4809 * We also need this for cases where a snapshot entry is replaced by some other 4810 * entry (file or directory) otherwise we end up with an unreplayable log due to 4811 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4812 * if it were a regular entry: 4813 * 4814 * mkdir /mnt/x 4815 * btrfs subvolume snapshot /mnt /mnt/x/snap 4816 * btrfs subvolume delete /mnt/x/snap 4817 * rmdir /mnt/x 4818 * mkdir /mnt/x 4819 * fsync /mnt/x or fsync some new file inside it 4820 * <power fail> 4821 * 4822 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4823 * the same transaction. 4824 */ 4825static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4826 const int slot, 4827 const struct btrfs_key *key, 4828 struct btrfs_inode *inode, 4829 u64 *other_ino, u64 *other_parent) 4830{ 4831 int ret; 4832 struct btrfs_path *search_path; 4833 char *name = NULL; 4834 u32 name_len = 0; 4835 u32 item_size = btrfs_item_size_nr(eb, slot); 4836 u32 cur_offset = 0; 4837 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4838 4839 search_path = btrfs_alloc_path(); 4840 if (!search_path) 4841 return -ENOMEM; 4842 search_path->search_commit_root = 1; 4843 search_path->skip_locking = 1; 4844 4845 while (cur_offset < item_size) { 4846 u64 parent; 4847 u32 this_name_len; 4848 u32 this_len; 4849 unsigned long name_ptr; 4850 struct btrfs_dir_item *di; 4851 4852 if (key->type == BTRFS_INODE_REF_KEY) { 4853 struct btrfs_inode_ref *iref; 4854 4855 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4856 parent = key->offset; 4857 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4858 name_ptr = (unsigned long)(iref + 1); 4859 this_len = sizeof(*iref) + this_name_len; 4860 } else { 4861 struct btrfs_inode_extref *extref; 4862 4863 extref = (struct btrfs_inode_extref *)(ptr + 4864 cur_offset); 4865 parent = btrfs_inode_extref_parent(eb, extref); 4866 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4867 name_ptr = (unsigned long)&extref->name; 4868 this_len = sizeof(*extref) + this_name_len; 4869 } 4870 4871 if (this_name_len > name_len) { 4872 char *new_name; 4873 4874 new_name = krealloc(name, this_name_len, GFP_NOFS); 4875 if (!new_name) { 4876 ret = -ENOMEM; 4877 goto out; 4878 } 4879 name_len = this_name_len; 4880 name = new_name; 4881 } 4882 4883 read_extent_buffer(eb, name, name_ptr, this_name_len); 4884 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4885 parent, name, this_name_len, 0); 4886 if (di && !IS_ERR(di)) { 4887 struct btrfs_key di_key; 4888 4889 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4890 di, &di_key); 4891 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4892 if (di_key.objectid != key->objectid) { 4893 ret = 1; 4894 *other_ino = di_key.objectid; 4895 *other_parent = parent; 4896 } else { 4897 ret = 0; 4898 } 4899 } else { 4900 ret = -EAGAIN; 4901 } 4902 goto out; 4903 } else if (IS_ERR(di)) { 4904 ret = PTR_ERR(di); 4905 goto out; 4906 } 4907 btrfs_release_path(search_path); 4908 4909 cur_offset += this_len; 4910 } 4911 ret = 0; 4912out: 4913 btrfs_free_path(search_path); 4914 kfree(name); 4915 return ret; 4916} 4917 4918struct btrfs_ino_list { 4919 u64 ino; 4920 u64 parent; 4921 struct list_head list; 4922}; 4923 4924static int log_conflicting_inodes(struct 
btrfs_trans_handle *trans, 4925 struct btrfs_root *root, 4926 struct btrfs_path *path, 4927 struct btrfs_log_ctx *ctx, 4928 u64 ino, u64 parent) 4929{ 4930 struct btrfs_ino_list *ino_elem; 4931 LIST_HEAD(inode_list); 4932 int ret = 0; 4933 4934 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 4935 if (!ino_elem) 4936 return -ENOMEM; 4937 ino_elem->ino = ino; 4938 ino_elem->parent = parent; 4939 list_add_tail(&ino_elem->list, &inode_list); 4940 4941 while (!list_empty(&inode_list)) { 4942 struct btrfs_fs_info *fs_info = root->fs_info; 4943 struct btrfs_key key; 4944 struct inode *inode; 4945 4946 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 4947 list); 4948 ino = ino_elem->ino; 4949 parent = ino_elem->parent; 4950 list_del(&ino_elem->list); 4951 kfree(ino_elem); 4952 if (ret) 4953 continue; 4954 4955 btrfs_release_path(path); 4956 4957 inode = btrfs_iget(fs_info->sb, ino, root); 4958 /* 4959 * If the other inode that had a conflicting dir entry was 4960 * deleted in the current transaction, we need to log its parent 4961 * directory. 4962 */ 4963 if (IS_ERR(inode)) { 4964 ret = PTR_ERR(inode); 4965 if (ret == -ENOENT) { 4966 inode = btrfs_iget(fs_info->sb, parent, root); 4967 if (IS_ERR(inode)) { 4968 ret = PTR_ERR(inode); 4969 } else { 4970 ret = btrfs_log_inode(trans, root, 4971 BTRFS_I(inode), 4972 LOG_OTHER_INODE_ALL, 4973 ctx); 4974 btrfs_add_delayed_iput(inode); 4975 } 4976 } 4977 continue; 4978 } 4979 /* 4980 * If the inode was already logged skip it - otherwise we can 4981 * hit an infinite loop. Example: 4982 * 4983 * From the commit root (previous transaction) we have the 4984 * following inodes: 4985 * 4986 * inode 257 a directory 4987 * inode 258 with references "zz" and "zz_link" on inode 257 4988 * inode 259 with reference "a" on inode 257 4989 * 4990 * And in the current (uncommitted) transaction we have: 4991 * 4992 * inode 257 a directory, unchanged 4993 * inode 258 with references "a" and "a2" on inode 257 4994 * inode 259 with reference "zz_link" on inode 257 4995 * inode 261 with reference "zz" on inode 257 4996 * 4997 * When logging inode 261 the following infinite loop could 4998 * happen if we don't skip already logged inodes: 4999 * 5000 * - we detect inode 258 as a conflicting inode, with inode 261 5001 * on reference "zz", and log it; 5002 * 5003 * - we detect inode 259 as a conflicting inode, with inode 258 5004 * on reference "a", and log it; 5005 * 5006 * - we detect inode 258 as a conflicting inode, with inode 259 5007 * on reference "zz_link", and log it - again! After this we 5008 * repeat the above steps forever. 5009 */ 5010 spin_lock(&BTRFS_I(inode)->lock); 5011 /* 5012 * Check the inode's logged_trans only instead of 5013 * btrfs_inode_in_log(). This is because the last_log_commit of 5014 * the inode is not updated when we only log that it exists and 5015 * it has the full sync bit set (see btrfs_log_inode()). 5016 */ 5017 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5018 spin_unlock(&BTRFS_I(inode)->lock); 5019 btrfs_add_delayed_iput(inode); 5020 continue; 5021 } 5022 spin_unlock(&BTRFS_I(inode)->lock); 5023 /* 5024 * We are safe logging the other inode without acquiring its 5025 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5026 * are safe against concurrent renames of the other inode as 5027 * well because during a rename we pin the log and update the 5028 * log with the new name before we unpin it. 
5029 */ 5030 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5031 LOG_OTHER_INODE, ctx); 5032 if (ret) { 5033 btrfs_add_delayed_iput(inode); 5034 continue; 5035 } 5036 5037 key.objectid = ino; 5038 key.type = BTRFS_INODE_REF_KEY; 5039 key.offset = 0; 5040 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5041 if (ret < 0) { 5042 btrfs_add_delayed_iput(inode); 5043 continue; 5044 } 5045 5046 while (true) { 5047 struct extent_buffer *leaf = path->nodes[0]; 5048 int slot = path->slots[0]; 5049 u64 other_ino = 0; 5050 u64 other_parent = 0; 5051 5052 if (slot >= btrfs_header_nritems(leaf)) { 5053 ret = btrfs_next_leaf(root, path); 5054 if (ret < 0) { 5055 break; 5056 } else if (ret > 0) { 5057 ret = 0; 5058 break; 5059 } 5060 continue; 5061 } 5062 5063 btrfs_item_key_to_cpu(leaf, &key, slot); 5064 if (key.objectid != ino || 5065 (key.type != BTRFS_INODE_REF_KEY && 5066 key.type != BTRFS_INODE_EXTREF_KEY)) { 5067 ret = 0; 5068 break; 5069 } 5070 5071 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5072 BTRFS_I(inode), &other_ino, 5073 &other_parent); 5074 if (ret < 0) 5075 break; 5076 if (ret > 0) { 5077 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5078 if (!ino_elem) { 5079 ret = -ENOMEM; 5080 break; 5081 } 5082 ino_elem->ino = other_ino; 5083 ino_elem->parent = other_parent; 5084 list_add_tail(&ino_elem->list, &inode_list); 5085 ret = 0; 5086 } 5087 path->slots[0]++; 5088 } 5089 btrfs_add_delayed_iput(inode); 5090 } 5091 5092 return ret; 5093} 5094 5095static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5096 struct btrfs_inode *inode, 5097 struct btrfs_key *min_key, 5098 const struct btrfs_key *max_key, 5099 struct btrfs_path *path, 5100 struct btrfs_path *dst_path, 5101 const u64 logged_isize, 5102 const bool recursive_logging, 5103 const int inode_only, 5104 struct btrfs_log_ctx *ctx, 5105 bool *need_log_inode_item) 5106{ 5107 const u64 i_size = i_size_read(&inode->vfs_inode); 5108 struct btrfs_root *root = inode->root; 5109 int ins_start_slot = 0; 5110 int ins_nr = 0; 5111 int ret; 5112 5113 while (1) { 5114 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5115 if (ret < 0) 5116 return ret; 5117 if (ret > 0) { 5118 ret = 0; 5119 break; 5120 } 5121again: 5122 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5123 if (min_key->objectid != max_key->objectid) 5124 break; 5125 if (min_key->type > max_key->type) 5126 break; 5127 5128 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5129 *need_log_inode_item = false; 5130 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5131 min_key->offset >= i_size) { 5132 /* 5133 * Extents at and beyond eof are logged with 5134 * btrfs_log_prealloc_extents(). 5135 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5136 * and no keys greater than that, so bail out. 
5137 */ 5138 break; 5139 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5140 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5141 inode->generation == trans->transid && 5142 !recursive_logging) { 5143 u64 other_ino = 0; 5144 u64 other_parent = 0; 5145 5146 ret = btrfs_check_ref_name_override(path->nodes[0], 5147 path->slots[0], min_key, inode, 5148 &other_ino, &other_parent); 5149 if (ret < 0) { 5150 return ret; 5151 } else if (ret > 0 && ctx && 5152 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5153 if (ins_nr > 0) { 5154 ins_nr++; 5155 } else { 5156 ins_nr = 1; 5157 ins_start_slot = path->slots[0]; 5158 } 5159 ret = copy_items(trans, inode, dst_path, path, 5160 ins_start_slot, ins_nr, 5161 inode_only, logged_isize); 5162 if (ret < 0) 5163 return ret; 5164 ins_nr = 0; 5165 5166 ret = log_conflicting_inodes(trans, root, path, 5167 ctx, other_ino, other_parent); 5168 if (ret) 5169 return ret; 5170 btrfs_release_path(path); 5171 goto next_key; 5172 } 5173 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5174 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5175 if (ins_nr == 0) 5176 goto next_slot; 5177 ret = copy_items(trans, inode, dst_path, path, 5178 ins_start_slot, 5179 ins_nr, inode_only, logged_isize); 5180 if (ret < 0) 5181 return ret; 5182 ins_nr = 0; 5183 goto next_slot; 5184 } 5185 5186 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5187 ins_nr++; 5188 goto next_slot; 5189 } else if (!ins_nr) { 5190 ins_start_slot = path->slots[0]; 5191 ins_nr = 1; 5192 goto next_slot; 5193 } 5194 5195 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5196 ins_nr, inode_only, logged_isize); 5197 if (ret < 0) 5198 return ret; 5199 ins_nr = 1; 5200 ins_start_slot = path->slots[0]; 5201next_slot: 5202 path->slots[0]++; 5203 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5204 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5205 path->slots[0]); 5206 goto again; 5207 } 5208 if (ins_nr) { 5209 ret = copy_items(trans, inode, dst_path, path, 5210 ins_start_slot, ins_nr, inode_only, 5211 logged_isize); 5212 if (ret < 0) 5213 return ret; 5214 ins_nr = 0; 5215 } 5216 btrfs_release_path(path); 5217next_key: 5218 if (min_key->offset < (u64)-1) { 5219 min_key->offset++; 5220 } else if (min_key->type < max_key->type) { 5221 min_key->type++; 5222 min_key->offset = 0; 5223 } else { 5224 break; 5225 } 5226 } 5227 if (ins_nr) { 5228 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5229 ins_nr, inode_only, logged_isize); 5230 if (ret) 5231 return ret; 5232 } 5233 5234 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5235 /* 5236 * Release the path because otherwise we might attempt to double 5237 * lock the same leaf with btrfs_log_prealloc_extents() below. 5238 */ 5239 btrfs_release_path(path); 5240 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5241 } 5242 5243 return ret; 5244} 5245 5246/* log a single inode in the tree log. 5247 * At least one parent directory for this inode must exist in the tree 5248 * or be logged already. 5249 * 5250 * Any items from this inode changed by the current transaction are copied 5251 * to the log tree. An extra reference is taken on any extents in this 5252 * file, allowing us to avoid a whole pile of corner cases around logging 5253 * blocks that have been removed from the tree. 5254 * 5255 * See LOG_INODE_ALL and related defines for a description of what inode_only 5256 * does. 5257 * 5258 * This handles both files and directories. 
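 * The inode's log_mutex is taken below. When called to log a
 * conflicting inode (LOG_OTHER_INODE or LOG_OTHER_INODE_ALL) it is
 * taken with SINGLE_DEPTH_NESTING, as the log_mutex of the originally
 * fsynced inode is already held.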
5259 */ 5260static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5261 struct btrfs_root *root, struct btrfs_inode *inode, 5262 int inode_only, 5263 struct btrfs_log_ctx *ctx) 5264{ 5265 struct btrfs_path *path; 5266 struct btrfs_path *dst_path; 5267 struct btrfs_key min_key; 5268 struct btrfs_key max_key; 5269 struct btrfs_root *log = root->log_root; 5270 int err = 0; 5271 int ret = 0; 5272 bool fast_search = false; 5273 u64 ino = btrfs_ino(inode); 5274 struct extent_map_tree *em_tree = &inode->extent_tree; 5275 u64 logged_isize = 0; 5276 bool need_log_inode_item = true; 5277 bool xattrs_logged = false; 5278 bool recursive_logging = false; 5279 5280 path = btrfs_alloc_path(); 5281 if (!path) 5282 return -ENOMEM; 5283 dst_path = btrfs_alloc_path(); 5284 if (!dst_path) { 5285 btrfs_free_path(path); 5286 return -ENOMEM; 5287 } 5288 5289 min_key.objectid = ino; 5290 min_key.type = BTRFS_INODE_ITEM_KEY; 5291 min_key.offset = 0; 5292 5293 max_key.objectid = ino; 5294 5295 5296 /* today the code can only do partial logging of directories */ 5297 if (S_ISDIR(inode->vfs_inode.i_mode) || 5298 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5299 &inode->runtime_flags) && 5300 inode_only >= LOG_INODE_EXISTS)) 5301 max_key.type = BTRFS_XATTR_ITEM_KEY; 5302 else 5303 max_key.type = (u8)-1; 5304 max_key.offset = (u64)-1; 5305 5306 /* 5307 * Only run delayed items if we are a directory. We want to make sure 5308 * all directory indexes hit the fs/subvolume tree so we can find them 5309 * and figure out which index ranges have to be logged. 5310 * 5311 * Otherwise commit the delayed inode only if the full sync flag is set, 5312 * as we want to make sure an up to date version is in the subvolume 5313 * tree so copy_inode_items_to_log() / copy_items() can find it and copy 5314 * it to the log tree. For a non full sync, we always log the inode item 5315 * based on the in-memory struct btrfs_inode which is always up to date. 5316 */ 5317 if (S_ISDIR(inode->vfs_inode.i_mode)) 5318 ret = btrfs_commit_inode_delayed_items(trans, inode); 5319 else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5320 ret = btrfs_commit_inode_delayed_inode(inode); 5321 5322 if (ret) { 5323 btrfs_free_path(path); 5324 btrfs_free_path(dst_path); 5325 return ret; 5326 } 5327 5328 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5329 recursive_logging = true; 5330 if (inode_only == LOG_OTHER_INODE) 5331 inode_only = LOG_INODE_EXISTS; 5332 else 5333 inode_only = LOG_INODE_ALL; 5334 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5335 } else { 5336 mutex_lock(&inode->log_mutex); 5337 } 5338 5339 /* 5340 * For symlinks, we must always log their content, which is stored in an 5341 * inline extent, otherwise we could end up with an empty symlink after 5342 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if 5343 * one attempts to create an empty symlink). 5344 * We don't need to worry about flushing delalloc, because when we create 5345 * the inline extent when the symlink is created (we never have delalloc 5346 * for symlinks). 5347 */ 5348 if (S_ISLNK(inode->vfs_inode.i_mode)) 5349 inode_only = LOG_INODE_ALL; 5350 5351 /* 5352 * a brute force approach to making sure we get the most uptodate 5353 * copies of everything. 
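 * Depending on the inode's state we first drop or truncate whatever a
 * previous log of this inode left in the log tree, and only then copy
 * the fresh items in.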
5354 */ 5355 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5356 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5357 5358 if (inode_only == LOG_INODE_EXISTS) 5359 max_key_type = BTRFS_XATTR_ITEM_KEY; 5360 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 5361 } else { 5362 if (inode_only == LOG_INODE_EXISTS) { 5363 /* 5364 * Make sure the new inode item we write to the log has 5365 * the same isize as the current one (if it exists). 5366 * This is necessary to prevent data loss after log 5367 * replay, and also to prevent doing a wrong expanding 5368 * truncate - for e.g. create file, write 4K into offset 5369 * 0, fsync, write 4K into offset 4096, add hard link, 5370 * fsync some other file (to sync log), power fail - if 5371 * we use the inode's current i_size, after log replay 5372 * we get a 8Kb file, with the last 4Kb extent as a hole 5373 * (zeroes), as if an expanding truncate happened, 5374 * instead of getting a file of 4Kb only. 5375 */ 5376 err = logged_inode_size(log, inode, path, &logged_isize); 5377 if (err) 5378 goto out_unlock; 5379 } 5380 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5381 &inode->runtime_flags)) { 5382 if (inode_only == LOG_INODE_EXISTS) { 5383 max_key.type = BTRFS_XATTR_ITEM_KEY; 5384 ret = drop_objectid_items(trans, log, path, ino, 5385 max_key.type); 5386 } else { 5387 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5388 &inode->runtime_flags); 5389 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5390 &inode->runtime_flags); 5391 while(1) { 5392 ret = btrfs_truncate_inode_items(trans, 5393 log, &inode->vfs_inode, 0, 0); 5394 if (ret != -EAGAIN) 5395 break; 5396 } 5397 } 5398 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5399 &inode->runtime_flags) || 5400 inode_only == LOG_INODE_EXISTS) { 5401 if (inode_only == LOG_INODE_ALL) 5402 fast_search = true; 5403 max_key.type = BTRFS_XATTR_ITEM_KEY; 5404 ret = drop_objectid_items(trans, log, path, ino, 5405 max_key.type); 5406 } else { 5407 if (inode_only == LOG_INODE_ALL) 5408 fast_search = true; 5409 goto log_extents; 5410 } 5411 5412 } 5413 if (ret) { 5414 err = ret; 5415 goto out_unlock; 5416 } 5417 5418 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5419 path, dst_path, logged_isize, 5420 recursive_logging, inode_only, ctx, 5421 &need_log_inode_item); 5422 if (err) 5423 goto out_unlock; 5424 5425 btrfs_release_path(path); 5426 btrfs_release_path(dst_path); 5427 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5428 if (err) 5429 goto out_unlock; 5430 xattrs_logged = true; 5431 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5432 btrfs_release_path(path); 5433 btrfs_release_path(dst_path); 5434 err = btrfs_log_holes(trans, root, inode, path); 5435 if (err) 5436 goto out_unlock; 5437 } 5438log_extents: 5439 btrfs_release_path(path); 5440 btrfs_release_path(dst_path); 5441 if (need_log_inode_item) { 5442 err = log_inode_item(trans, log, dst_path, inode); 5443 if (!err && !xattrs_logged) { 5444 err = btrfs_log_all_xattrs(trans, root, inode, path, 5445 dst_path); 5446 btrfs_release_path(path); 5447 } 5448 if (err) 5449 goto out_unlock; 5450 } 5451 if (fast_search) { 5452 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5453 ctx); 5454 if (ret) { 5455 err = ret; 5456 goto out_unlock; 5457 } 5458 } else if (inode_only == LOG_INODE_ALL) { 5459 struct extent_map *em, *n; 5460 5461 write_lock(&em_tree->lock); 5462 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5463 list_del_init(&em->list); 5464 write_unlock(&em_tree->lock); 5465 } 5466 5467 if 
(inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5468 ret = log_directory_changes(trans, root, inode, path, dst_path, 5469 ctx); 5470 if (ret) { 5471 err = ret; 5472 goto out_unlock; 5473 } 5474 } 5475 5476 /* 5477 * If we are logging that an ancestor inode exists as part of logging a 5478 * new name from a link or rename operation, don't mark the inode as 5479 * logged - otherwise if an explicit fsync is made against an ancestor, 5480 * the fsync considers the inode in the log and doesn't sync the log, 5481 * resulting in the ancestor missing after a power failure unless the 5482 * log was synced as part of an fsync against any other unrelated inode. 5483 * So keep it simple for this case and just don't flag the ancestors as 5484 * logged. 5485 */ 5486 if (!ctx || 5487 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && 5488 &inode->vfs_inode != ctx->inode)) { 5489 spin_lock(&inode->lock); 5490 inode->logged_trans = trans->transid; 5491 /* 5492 * Don't update last_log_commit if we logged that an inode exists 5493 * after it was loaded to memory (full_sync bit set). 5494 * This is to prevent data loss when we do a write to the inode, 5495 * then the inode gets evicted after all delalloc was flushed, 5496 * then we log it exists (due to a rename for example) and then 5497 * fsync it. This last fsync would do nothing (not logging the 5498 * extents previously written). 5499 */ 5500 if (inode_only != LOG_INODE_EXISTS || 5501 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5502 inode->last_log_commit = inode->last_sub_trans; 5503 spin_unlock(&inode->lock); 5504 } 5505out_unlock: 5506 mutex_unlock(&inode->log_mutex); 5507 5508 btrfs_free_path(path); 5509 btrfs_free_path(dst_path); 5510 return err; 5511} 5512 5513/* 5514 * Check if we must fallback to a transaction commit when logging an inode. 5515 * This must be called after logging the inode and is used only in the context 5516 * when fsyncing an inode requires the need to log some other inode - in which 5517 * case we can't lock the i_mutex of each other inode we need to log as that 5518 * can lead to deadlocks with concurrent fsync against other inodes (as we can 5519 * log inodes up or down in the hierarchy) or rename operations for example. So 5520 * we take the log_mutex of the inode after we have logged it and then check for 5521 * its last_unlink_trans value - this is safe because any task setting 5522 * last_unlink_trans must take the log_mutex and it must do this before it does 5523 * the actual unlink operation, so if we do this check before a concurrent task 5524 * sets last_unlink_trans it means we've logged a consistent version/state of 5525 * all the inode items, otherwise we are not sure and must do a transaction 5526 * commit (the concurrent task might have only updated last_unlink_trans before 5527 * we logged the inode or it might have also done the unlink). 5528 */ 5529static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, 5530 struct btrfs_inode *inode) 5531{ 5532 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5533 bool ret = false; 5534 5535 mutex_lock(&inode->log_mutex); 5536 if (inode->last_unlink_trans > fs_info->last_trans_committed) { 5537 /* 5538 * Make sure any commits to the log are forced to be full 5539 * commits. 
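 * btrfs_sync_log() checks this and bails out, making the fsync path
 * commit the whole transaction instead.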
5540 */ 5541 btrfs_set_log_full_commit(trans); 5542 ret = true; 5543 } 5544 mutex_unlock(&inode->log_mutex); 5545 5546 return ret; 5547} 5548 5549/* 5550 * follow the dentry parent pointers up the chain and see if any 5551 * of the directories in it require a full commit before they can 5552 * be logged. Returns zero if nothing special needs to be done or 1 if 5553 * a full commit is required. 5554 */ 5555static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 5556 struct btrfs_inode *inode, 5557 struct dentry *parent, 5558 struct super_block *sb, 5559 u64 last_committed) 5560{ 5561 int ret = 0; 5562 struct dentry *old_parent = NULL; 5563 5564 /* 5565 * for regular files, if its inode is already on disk, we don't 5566 * have to worry about the parents at all. This is because 5567 * we can use the last_unlink_trans field to record renames 5568 * and other fun in this file. 5569 */ 5570 if (S_ISREG(inode->vfs_inode.i_mode) && 5571 inode->generation <= last_committed && 5572 inode->last_unlink_trans <= last_committed) 5573 goto out; 5574 5575 if (!S_ISDIR(inode->vfs_inode.i_mode)) { 5576 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5577 goto out; 5578 inode = BTRFS_I(d_inode(parent)); 5579 } 5580 5581 while (1) { 5582 if (btrfs_must_commit_transaction(trans, inode)) { 5583 ret = 1; 5584 break; 5585 } 5586 5587 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5588 break; 5589 5590 if (IS_ROOT(parent)) { 5591 inode = BTRFS_I(d_inode(parent)); 5592 if (btrfs_must_commit_transaction(trans, inode)) 5593 ret = 1; 5594 break; 5595 } 5596 5597 parent = dget_parent(parent); 5598 dput(old_parent); 5599 old_parent = parent; 5600 inode = BTRFS_I(d_inode(parent)); 5601 5602 } 5603 dput(old_parent); 5604out: 5605 return ret; 5606} 5607 5608struct btrfs_dir_list { 5609 u64 ino; 5610 struct list_head list; 5611}; 5612 5613/* 5614 * Log the inodes of the new dentries of a directory. See log_dir_items() for 5615 * details about the why it is needed. 5616 * This is a recursive operation - if an existing dentry corresponds to a 5617 * directory, that directory's new entries are logged too (same behaviour as 5618 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5619 * the dentries point to we do not lock their i_mutex, otherwise lockdep 5620 * complains about the following circular lock dependency / possible deadlock: 5621 * 5622 * CPU0 CPU1 5623 * ---- ---- 5624 * lock(&type->i_mutex_dir_key#3/2); 5625 * lock(sb_internal#2); 5626 * lock(&type->i_mutex_dir_key#3/2); 5627 * lock(&sb->s_type->i_mutex_key#14); 5628 * 5629 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5630 * sb_start_intwrite() in btrfs_start_transaction(). 5631 * Not locking i_mutex of the inodes is still safe because: 5632 * 5633 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5634 * that while logging the inode new references (names) are added or removed 5635 * from the inode, leaving the logged inode item with a link count that does 5636 * not match the number of logged inode reference items. This is fine because 5637 * at log replay time we compute the real number of links and correct the 5638 * link count in the inode item (see replay_one_buffer() and 5639 * link_to_fixup_dir()); 5640 * 5641 * 2) For directories we log with a mode of LOG_INODE_ALL. 
It's possible that 5642 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and 5643 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item 5644 * has a size that doesn't match the sum of the lengths of all the logged 5645 * names. This does not result in a problem because if a dir_item key is 5646 * logged but its matching dir_index key is not logged, at log replay time we 5647 * don't use it to replay the respective name (see replay_one_name()). On the 5648 * other hand if only the dir_index key ends up being logged, the respective 5649 * name is added to the fs/subvol tree with both the dir_item and dir_index 5650 * keys created (see replay_one_name()). 5651 * The directory's inode item with a wrong i_size is not a problem as well, 5652 * since we don't use it at log replay time to set the i_size in the inode 5653 * item of the fs/subvol tree (see overwrite_item()). 5654 */ 5655static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5656 struct btrfs_root *root, 5657 struct btrfs_inode *start_inode, 5658 struct btrfs_log_ctx *ctx) 5659{ 5660 struct btrfs_fs_info *fs_info = root->fs_info; 5661 struct btrfs_root *log = root->log_root; 5662 struct btrfs_path *path; 5663 LIST_HEAD(dir_list); 5664 struct btrfs_dir_list *dir_elem; 5665 int ret = 0; 5666 5667 path = btrfs_alloc_path(); 5668 if (!path) 5669 return -ENOMEM; 5670 5671 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5672 if (!dir_elem) { 5673 btrfs_free_path(path); 5674 return -ENOMEM; 5675 } 5676 dir_elem->ino = btrfs_ino(start_inode); 5677 list_add_tail(&dir_elem->list, &dir_list); 5678 5679 while (!list_empty(&dir_list)) { 5680 struct extent_buffer *leaf; 5681 struct btrfs_key min_key; 5682 int nritems; 5683 int i; 5684 5685 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5686 list); 5687 if (ret) 5688 goto next_dir_inode; 5689 5690 min_key.objectid = dir_elem->ino; 5691 min_key.type = BTRFS_DIR_ITEM_KEY; 5692 min_key.offset = 0; 5693again: 5694 btrfs_release_path(path); 5695 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5696 if (ret < 0) { 5697 goto next_dir_inode; 5698 } else if (ret > 0) { 5699 ret = 0; 5700 goto next_dir_inode; 5701 } 5702 5703process_leaf: 5704 leaf = path->nodes[0]; 5705 nritems = btrfs_header_nritems(leaf); 5706 for (i = path->slots[0]; i < nritems; i++) { 5707 struct btrfs_dir_item *di; 5708 struct btrfs_key di_key; 5709 struct inode *di_inode; 5710 struct btrfs_dir_list *new_dir_elem; 5711 int log_mode = LOG_INODE_EXISTS; 5712 int type; 5713 5714 btrfs_item_key_to_cpu(leaf, &min_key, i); 5715 if (min_key.objectid != dir_elem->ino || 5716 min_key.type != BTRFS_DIR_ITEM_KEY) 5717 goto next_dir_inode; 5718 5719 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5720 type = btrfs_dir_type(leaf, di); 5721 if (btrfs_dir_transid(leaf, di) < trans->transid && 5722 type != BTRFS_FT_DIR) 5723 continue; 5724 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5725 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5726 continue; 5727 5728 btrfs_release_path(path); 5729 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5730 if (IS_ERR(di_inode)) { 5731 ret = PTR_ERR(di_inode); 5732 goto next_dir_inode; 5733 } 5734 5735 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { 5736 btrfs_add_delayed_iput(di_inode); 5737 break; 5738 } 5739 5740 ctx->log_new_dentries = false; 5741 if (type == BTRFS_FT_DIR) 5742 log_mode = LOG_INODE_ALL; 5743 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5744 log_mode, ctx); 5745 if (!ret && 
5746 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) 5747 ret = 1; 5748 btrfs_add_delayed_iput(di_inode); 5749 if (ret) 5750 goto next_dir_inode; 5751 if (ctx->log_new_dentries) { 5752 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5753 GFP_NOFS); 5754 if (!new_dir_elem) { 5755 ret = -ENOMEM; 5756 goto next_dir_inode; 5757 } 5758 new_dir_elem->ino = di_key.objectid; 5759 list_add_tail(&new_dir_elem->list, &dir_list); 5760 } 5761 break; 5762 } 5763 if (i == nritems) { 5764 ret = btrfs_next_leaf(log, path); 5765 if (ret < 0) { 5766 goto next_dir_inode; 5767 } else if (ret > 0) { 5768 ret = 0; 5769 goto next_dir_inode; 5770 } 5771 goto process_leaf; 5772 } 5773 if (min_key.offset < (u64)-1) { 5774 min_key.offset++; 5775 goto again; 5776 } 5777next_dir_inode: 5778 list_del(&dir_elem->list); 5779 kfree(dir_elem); 5780 } 5781 5782 btrfs_free_path(path); 5783 return ret; 5784} 5785 5786static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 5787 struct btrfs_inode *inode, 5788 struct btrfs_log_ctx *ctx) 5789{ 5790 struct btrfs_fs_info *fs_info = trans->fs_info; 5791 int ret; 5792 struct btrfs_path *path; 5793 struct btrfs_key key; 5794 struct btrfs_root *root = inode->root; 5795 const u64 ino = btrfs_ino(inode); 5796 5797 path = btrfs_alloc_path(); 5798 if (!path) 5799 return -ENOMEM; 5800 path->skip_locking = 1; 5801 path->search_commit_root = 1; 5802 5803 key.objectid = ino; 5804 key.type = BTRFS_INODE_REF_KEY; 5805 key.offset = 0; 5806 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5807 if (ret < 0) 5808 goto out; 5809 5810 while (true) { 5811 struct extent_buffer *leaf = path->nodes[0]; 5812 int slot = path->slots[0]; 5813 u32 cur_offset = 0; 5814 u32 item_size; 5815 unsigned long ptr; 5816 5817 if (slot >= btrfs_header_nritems(leaf)) { 5818 ret = btrfs_next_leaf(root, path); 5819 if (ret < 0) 5820 goto out; 5821 else if (ret > 0) 5822 break; 5823 continue; 5824 } 5825 5826 btrfs_item_key_to_cpu(leaf, &key, slot); 5827 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 5828 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 5829 break; 5830 5831 item_size = btrfs_item_size_nr(leaf, slot); 5832 ptr = btrfs_item_ptr_offset(leaf, slot); 5833 while (cur_offset < item_size) { 5834 struct btrfs_key inode_key; 5835 struct inode *dir_inode; 5836 5837 inode_key.type = BTRFS_INODE_ITEM_KEY; 5838 inode_key.offset = 0; 5839 5840 if (key.type == BTRFS_INODE_EXTREF_KEY) { 5841 struct btrfs_inode_extref *extref; 5842 5843 extref = (struct btrfs_inode_extref *) 5844 (ptr + cur_offset); 5845 inode_key.objectid = btrfs_inode_extref_parent( 5846 leaf, extref); 5847 cur_offset += sizeof(*extref); 5848 cur_offset += btrfs_inode_extref_name_len(leaf, 5849 extref); 5850 } else { 5851 inode_key.objectid = key.offset; 5852 cur_offset = item_size; 5853 } 5854 5855 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 5856 root); 5857 /* 5858 * If the parent inode was deleted, return an error to 5859 * fallback to a transaction commit. This is to prevent 5860 * getting an inode that was moved from one parent A to 5861 * a parent B, got its former parent A deleted and then 5862 * it got fsync'ed, from existing at both parents after 5863 * a log replay (and the old parent still existing). 
5864 * Example: 5865 * 5866 * mkdir /mnt/A 5867 * mkdir /mnt/B 5868 * touch /mnt/B/bar 5869 * sync 5870 * mv /mnt/B/bar /mnt/A/bar 5871 * mv -T /mnt/A /mnt/B 5872 * fsync /mnt/B/bar 5873 * <power fail> 5874 * 5875 * If we ignore the old parent B which got deleted, 5876 * after a log replay we would have file bar linked 5877 * at both parents and the old parent B would still 5878 * exist. 5879 */ 5880 if (IS_ERR(dir_inode)) { 5881 ret = PTR_ERR(dir_inode); 5882 goto out; 5883 } 5884 5885 if (ctx) 5886 ctx->log_new_dentries = false; 5887 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), 5888 LOG_INODE_ALL, ctx); 5889 if (!ret && 5890 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) 5891 ret = 1; 5892 if (!ret && ctx && ctx->log_new_dentries) 5893 ret = log_new_dir_dentries(trans, root, 5894 BTRFS_I(dir_inode), ctx); 5895 btrfs_add_delayed_iput(dir_inode); 5896 if (ret) 5897 goto out; 5898 } 5899 path->slots[0]++; 5900 } 5901 ret = 0; 5902out: 5903 btrfs_free_path(path); 5904 return ret; 5905} 5906 5907static int log_new_ancestors(struct btrfs_trans_handle *trans, 5908 struct btrfs_root *root, 5909 struct btrfs_path *path, 5910 struct btrfs_log_ctx *ctx) 5911{ 5912 struct btrfs_key found_key; 5913 5914 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5915 5916 while (true) { 5917 struct btrfs_fs_info *fs_info = root->fs_info; 5918 const u64 last_committed = fs_info->last_trans_committed; 5919 struct extent_buffer *leaf = path->nodes[0]; 5920 int slot = path->slots[0]; 5921 struct btrfs_key search_key; 5922 struct inode *inode; 5923 u64 ino; 5924 int ret = 0; 5925 5926 btrfs_release_path(path); 5927 5928 ino = found_key.offset; 5929 5930 search_key.objectid = found_key.offset; 5931 search_key.type = BTRFS_INODE_ITEM_KEY; 5932 search_key.offset = 0; 5933 inode = btrfs_iget(fs_info->sb, ino, root); 5934 if (IS_ERR(inode)) 5935 return PTR_ERR(inode); 5936 5937 if (BTRFS_I(inode)->generation > last_committed) 5938 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5939 LOG_INODE_EXISTS, ctx); 5940 btrfs_add_delayed_iput(inode); 5941 if (ret) 5942 return ret; 5943 5944 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 5945 break; 5946 5947 search_key.type = BTRFS_INODE_REF_KEY; 5948 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5949 if (ret < 0) 5950 return ret; 5951 5952 leaf = path->nodes[0]; 5953 slot = path->slots[0]; 5954 if (slot >= btrfs_header_nritems(leaf)) { 5955 ret = btrfs_next_leaf(root, path); 5956 if (ret < 0) 5957 return ret; 5958 else if (ret > 0) 5959 return -ENOENT; 5960 leaf = path->nodes[0]; 5961 slot = path->slots[0]; 5962 } 5963 5964 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5965 if (found_key.objectid != search_key.objectid || 5966 found_key.type != BTRFS_INODE_REF_KEY) 5967 return -ENOENT; 5968 } 5969 return 0; 5970} 5971 5972static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 5973 struct btrfs_inode *inode, 5974 struct dentry *parent, 5975 struct btrfs_log_ctx *ctx) 5976{ 5977 struct btrfs_root *root = inode->root; 5978 struct btrfs_fs_info *fs_info = root->fs_info; 5979 struct dentry *old_parent = NULL; 5980 struct super_block *sb = inode->vfs_inode.i_sb; 5981 int ret = 0; 5982 5983 while (true) { 5984 if (!parent || d_really_is_negative(parent) || 5985 sb != parent->d_sb) 5986 break; 5987 5988 inode = BTRFS_I(d_inode(parent)); 5989 if (root != inode->root) 5990 break; 5991 5992 if (inode->generation > fs_info->last_trans_committed) { 5993 ret = btrfs_log_inode(trans, root, inode, 5994 
static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		const u64 last_committed = fs_info->last_trans_committed;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation > last_committed)
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation > fs_info->last_trans_committed) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}

static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For the single hard link case, go through a fast path that does not
	 * need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * cases and too complex to deal with (we would need to keep
		 * track of which subitem we are processing for each item in
		 * this loop, etc). So just return some error to fall back to
		 * a transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that key
		 * after logging all new ancestors for the current hard link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
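/*
 * Condensed sketch of the release-and-resume idiom used above, under a
 * hypothetical name (example_resume_scan): save the last processed key, drop
 * the path (and with it any locks) so other searches can run, then re-search
 * from the saved key and step past it.
 */
static int __maybe_unused example_resume_scan(struct btrfs_root *root,
					      struct btrfs_path *path,
					      struct btrfs_key *resume_key)
{
	int ret;

	btrfs_release_path(path);
	/* ... other tree searches can safely happen here ... */
	ret = btrfs_search_slot(NULL, root, resume_key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0)
		path->slots[0]++;	/* skip the already processed key */
	return 0;
}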
/*
 * Helper function around btrfs_log_inode() to make sure newly created
 * parent directories also end up in the log. A minimal inode and backref
 * only logging is done of any parent directories that are older than
 * the last committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct super_block *sb;
	int ret = 0;
	u64 last_committed = fs_info->last_trans_committed;
	bool log_dentries = false;

	sb = inode->vfs_inode.i_sb;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = 1;
		goto end_no_trans;
	}

	/*
	 * If the previous transaction commit did not complete, we have to do
	 * a full commit ourselves.
	 */
	if (fs_info->last_trans_log_full_commit >
	    fs_info->last_trans_committed) {
		ret = 1;
		goto end_no_trans;
	}

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = 1;
		goto end_no_trans;
	}

	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
					 last_committed);
	if (ret)
		goto end_no_trans;

	/*
	 * Skip already logged inodes or inodes corresponding to tmpfiles
	 * (since logging them is pointless, a link count of 0 means they
	 * will never be accessible).
	 */
	if ((btrfs_inode_in_log(inode, trans->transid) &&
	     list_empty(&ctx->ordered_extents)) ||
	    inode->vfs_inode.i_nlink == 0) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
	if (ret)
		goto end_trans;

	/*
	 * For regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation <= last_committed &&
	    inode->last_unlink_trans <= last_committed) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory
	 * being impossible to delete after log replay (rmdir will always fail
	 * with error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the bar
	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
	 * the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the file inode with name foo3
	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
	 * and has a link count of 2.
	 */
	if (inode->last_unlink_trans > last_committed) {
		ret = btrfs_log_all_parents(trans, inode, ctx);
		if (ret)
			goto end_trans;
	}

	ret = log_all_new_ancestors(trans, inode, parent, ctx);
	if (ret)
		goto end_trans;

	if (log_dentries)
		ret = log_new_dir_dentries(trans, root, inode, ctx);
	else
		ret = 0;
end_trans:
	if (ret < 0) {
		btrfs_set_log_full_commit(trans);
		ret = 1;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * It is not safe to log a dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}
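/*
 * A condensed, assumption-laden sketch of the caller pattern implied by the
 * comment above. The real logic lives in the fsync path and handles much
 * more (ordered extents, error propagation, etc.); example_fsync_tail is a
 * hypothetical name, not the actual fsync implementation.
 */
static int __maybe_unused example_fsync_tail(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct dentry *dentry,
					     struct btrfs_log_ctx *ctx)
{
	int ret;

	ret = btrfs_log_dentry_safe(trans, dentry, ctx);
	if (ret == 0) {
		/* Dentry was logged; writing the log tree finishes the fsync. */
		ret = btrfs_sync_log(trans, root, ctx);
		if (ret == 0)
			return btrfs_end_transaction(trans);
	} else if (ret == BTRFS_NO_LOG_SYNC) {
		/* Nothing needed logging at all. */
		return btrfs_end_transaction(trans);
	}
	/* Logging was unsafe or failed: fall back to a full commit. */
	return btrfs_commit_transaction(trans);
}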
/*
 * Should be called during mount to recover and replay any log trees
 * from the FS.
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_find_highest_objectid(root,
						  &root->highest_objectid);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_free_path(path);
	return ret;
}

/*
 * There are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * When we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * If this directory was already logged, any new
	 * names for this file/dir will get recorded.
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * If the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names.
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * When renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}
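/*
 * Minimal sketch of the ordering that the comment above requires, using a
 * hypothetical caller (example_unlink is not the real unlink path, which
 * also updates the directory and inode items):
 */
static int __maybe_unused example_unlink(struct btrfs_trans_handle *trans,
					 struct btrfs_inode *dir,
					 struct dentry *dentry)
{
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));

	/*
	 * Record the pending unlink first, while the subvolume tree is still
	 * untouched, so a later fsync of the directory or the inode knows it
	 * cannot rely on the tree log alone.
	 */
	btrfs_record_unlink_dir(trans, dir, inode, 0);

	/* ... only now remove the name from the directory in the tree ... */
	return 0;
}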
/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
			struct dentry *parent)
{
	struct btrfs_log_ctx ctx;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * If this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged either, we don't need to log it.
	 */
	if (!inode_logged(trans, inode) &&
	    (!old_dir || !inode_logged(trans, old_dir)))
		return;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a
	 * full transaction commit (due to a call to
	 * btrfs_set_log_full_commit()), so we don't need to worry about getting
	 * a log committed that has an inconsistent state after a rename
	 * operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
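/*
 * Minimal sketch of the rename-side usage of btrfs_log_new_name(); the
 * hypothetical example_rename_tail only shows the ordering: all subvolume
 * tree updates for the new name happen first, then the log is told about it.
 */
static void __maybe_unused example_rename_tail(struct btrfs_trans_handle *trans,
					       struct btrfs_inode *inode,
					       struct btrfs_inode *old_dir,
					       struct dentry *new_dentry)
{
	struct dentry *parent = dget_parent(new_dentry);

	/* The rename itself (new name inserted, old name dropped) is done. */
	btrfs_log_new_name(trans, inode, old_dir, parent);
	dput(parent);
}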