// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
struct kmem_cache *btrfs_delayed_data_ref_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	bool ret = false;
	u64 reserved;

	spin_lock(&global_rsv->lock);
	reserved = global_rsv->reserved;
	spin_unlock(&global_rsv->lock);

	/*
	 * Since the global reserve is just kind of magic we don't really want
	 * to rely on it to save our bacon, so if our size is more than the
	 * delayed_refs_rsv and the global rsv then it's time to think about
	 * bailing.
	 */
	spin_lock(&delayed_refs_rsv->lock);
	reserved += delayed_refs_rsv->reserved;
	if (delayed_refs_rsv->size >= reserved)
		ret = true;
	spin_unlock(&delayed_refs_rsv->lock);
	return ret;
}

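/*
 * Decide whether the caller should run delayed refs itself to throttle the
 * rate at which they are being generated.
 *
 * The estimate multiplies the number of queued delayed ref entries by the
 * average time it has taken to run one and compares that against one second
 * (and half a second) worth of work.  If neither threshold is hit, fall back
 * to checking whether the delayed refs rsv has outgrown its reservation.
 */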
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
	u64 num_entries =
		atomic_read(&trans->transaction->delayed_refs.num_entries);
	u64 avg_runtime;
	u64 val;

	smp_mb();
	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
	val = num_entries * avg_runtime;
	if (val >= NSEC_PER_SEC)
		return 1;
	if (val >= NSEC_PER_SEC / 2)
		return 2;

	return btrfs_check_space_for_delayed_refs(trans->fs_info);
}

/**
 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
 * @fs_info - the fs_info for our fs.
 * @nr - the number of items to drop.
 *
 * This drops the delayed ref head's count from the delayed refs rsv and frees
 * any excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
	u64 released = 0;

	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
	if (released)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
 * @trans - the trans that may have generated delayed refs
 *
 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
 * it'll calculate the additional size and add it to the delayed_refs_rsv.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;

	if (!trans->delayed_ref_updates)
		return;

	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
						    trans->delayed_ref_updates);
	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->full = 0;
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
}

/**
 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
 * @fs_info - the fs info for our fs.
 * @src - the source block rsv to transfer from.
 * @num_bytes - the number of bytes to transfer.
 *
 * This transfers up to the num_bytes amount from the src rsv to the
 * delayed_refs_rsv.  Any extra bytes are returned to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *src,
				       u64 num_bytes)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	u64 to_free = 0;

	spin_lock(&src->lock);
	src->reserved -= num_bytes;
	src->size -= num_bytes;
	spin_unlock(&src->lock);

	spin_lock(&delayed_refs_rsv->lock);
	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
		u64 delta = delayed_refs_rsv->size -
			delayed_refs_rsv->reserved;
		if (num_bytes > delta) {
			to_free = num_bytes - delta;
			num_bytes = delta;
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}

	if (num_bytes)
		delayed_refs_rsv->reserved += num_bytes;
	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
		delayed_refs_rsv->full = 1;
	spin_unlock(&delayed_refs_rsv->lock);

	if (num_bytes)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, num_bytes, 1);
	if (to_free)
		btrfs_space_info_free_bytes_may_use(fs_info,
				delayed_refs_rsv->space_info, to_free);
}

/**
 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
 * @fs_info - the fs_info for our fs.
 * @flush - control how we can flush for this reservation.
 *
 * This will refill the delayed block_rsv up to one item's worth of space and
 * will return -ENOSPC if we can't make the reservation.
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 num_bytes = 0;
	int ret = -ENOSPC;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = block_rsv->size - block_rsv->reserved;
		num_bytes = min(num_bytes, limit);
	}
	spin_unlock(&block_rsv->lock);

	if (!num_bytes)
		return 0;

	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
					   num_bytes, flush);
	if (ret)
		return ret;
	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
				      0, num_bytes, 1);
	return 0;
}

/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
			  struct btrfs_delayed_tree_ref *ref2)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
			  struct btrfs_delayed_data_ref *ref2)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

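/*
 * compare two delayed ref nodes with the same bytenr.  The type is compared
 * first, then the per-type fields via comp_tree_refs()/comp_data_refs(), and
 * finally the sequence number if check_seq is set.
 */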
static int comp_refs(struct btrfs_delayed_ref_node *ref1,
		     struct btrfs_delayed_ref_node *ref2,
		     bool check_seq)
{
	int ret = 0;

	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
				     btrfs_delayed_node_to_tree_ref(ref2));
	else
		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
				     btrfs_delayed_node_to_data_ref(ref2));
	if (ret)
		return ret;
	if (check_seq) {
		if (ref1->seq < ref2->seq)
			return -1;
		if (ref1->seq > ref2->seq)
			return 1;
	}
	return 0;
}

/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
						   struct rb_node *node)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_head *entry;
	struct btrfs_delayed_ref_head *ins;
	u64 bytenr;
	bool leftmost = true;

	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
	bytenr = ins->bytenr;
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
				 href_node);

		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

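/*
 * Insert a delayed ref node into a head's ref tree, ordered by comp_refs().
 * Returns the already queued node if an equal one exists, or NULL once the
 * new node has been linked in.
 */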
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
		struct btrfs_delayed_ref_node *ins)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *node = &ins->ref_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	bool leftmost = true;

	while (*p) {
		int comp;

		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 ref_node);
		comp = comp_refs(ins, entry, true);
		if (comp < 0) {
			p = &(*p)->rb_left;
		} else if (comp > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_head *find_first_ref_head(
		struct btrfs_delayed_ref_root *dr)
{
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = rb_first_cached(&dr->href_root);
	if (!n)
		return NULL;

	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

	return entry;
}

/*
 * Find a head entry based on bytenr. This returns the delayed ref head if it
 * was able to find one, or NULL if nothing was in that spot.  If return_bigger
 * is given, the next bigger entry is returned if no exact match is found.
 */
static struct btrfs_delayed_ref_head *find_ref_head(
		struct btrfs_delayed_ref_root *dr, u64 bytenr,
		bool return_bigger)
{
	struct rb_root *root = &dr->href_root.rb_root;
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = root->rb_node;
	entry = NULL;
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return entry;
	}
	if (entry && return_bigger) {
		if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
			if (!n)
				return NULL;
			entry = rb_entry(n, struct btrfs_delayed_ref_head,
					 href_node);
		}
		return entry;
	}
	return NULL;
}

int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (RB_EMPTY_NODE(&head->href_node)) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_root *delayed_refs,
				    struct btrfs_delayed_ref_head *head,
				    struct btrfs_delayed_ref_node *ref)
{
	lockdep_assert_held(&head->lock);
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	ref->in_tree = 0;
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);
}

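/*
 * Try to merge @ref with the refs that follow it in the head's tree and match
 * it per comp_refs(): each mergeable ref has its ref_mod folded into @ref and
 * is dropped, and @ref itself is dropped if its count reaches zero.  Refs at
 * or above @seq are left alone so tree mod log users still see them.  Returns
 * true if @ref was dropped or swapped, in which case the caller must restart
 * its scan.
 */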
static bool merge_ref(struct btrfs_trans_handle *trans,
		      struct btrfs_delayed_ref_root *delayed_refs,
		      struct btrfs_delayed_ref_head *head,
		      struct btrfs_delayed_ref_node *ref,
		      u64 seq)
{
	struct btrfs_delayed_ref_node *next;
	struct rb_node *node = rb_next(&ref->ref_node);
	bool done = false;

	while (!done && node) {
		int mod;

		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		node = rb_next(node);
		if (seq && next->seq >= seq)
			break;
		if (comp_refs(ref, next, false))
			break;

		if (ref->action == next->action) {
			mod = next->ref_mod;
		} else {
			if (ref->ref_mod < next->ref_mod) {
				swap(ref, next);
				done = true;
			}
			mod = -next->ref_mod;
		}

		drop_delayed_ref(trans, delayed_refs, head, next);
		ref->ref_mod += mod;
		if (ref->ref_mod == 0) {
			drop_delayed_ref(trans, delayed_refs, head, ref);
			done = true;
		} else {
			/*
			 * Can't have multiples of the same ref on a tree block.
			 */
			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	}

	return done;
}

void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
			      struct btrfs_delayed_ref_root *delayed_refs,
			      struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	u64 seq = 0;

	lockdep_assert_held(&head->lock);

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return;

	/* We don't have too many refs to merge for data. */
	if (head->is_data)
		return;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct seq_list, list);
		seq = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

again:
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		if (seq && ref->seq >= seq)
			continue;
		if (merge_ref(trans, delayed_refs, head, ref, seq))
			goto again;
	}
}

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
	struct seq_list *elem;
	int ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct seq_list, list);
		if (seq >= elem->seq) {
			btrfs_debug(fs_info,
				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
				(u32)(seq >> 32), (u32)seq,
				(u32)(elem->seq >> 32), (u32)elem->seq);
			ret = 1;
		}
	}

	read_unlock(&fs_info->tree_mod_log_lock);
	return ret;
}

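/*
 * Select the next delayed ref head to process.  The search starts at
 * delayed_refs->run_delayed_start and wraps around to the beginning of the
 * tree once, skipping heads that are already being processed.  The returned
 * head is marked as processing and run_delayed_start is advanced past it;
 * NULL is returned when there is nothing left to do.
 */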
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
		struct btrfs_delayed_ref_root *delayed_refs)
{
	struct btrfs_delayed_ref_head *head;

again:
	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
			     true);
	if (!head && delayed_refs->run_delayed_start != 0) {
		delayed_refs->run_delayed_start = 0;
		head = find_first_ref_head(delayed_refs);
	}
	if (!head)
		return NULL;

	while (head->processing) {
		struct rb_node *node;

		node = rb_next(&head->href_node);
		if (!node) {
			if (delayed_refs->run_delayed_start == 0)
				return NULL;
			delayed_refs->run_delayed_start = 0;
			goto again;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
	}

	head->processing = 1;
	WARN_ON(delayed_refs->num_heads_ready == 0);
	delayed_refs->num_heads_ready--;
	delayed_refs->run_delayed_start = head->bytenr +
		head->num_bytes;
	return head;
}

void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	lockdep_assert_held(&head->lock);

	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);
	atomic_dec(&delayed_refs->num_entries);
	delayed_refs->num_heads--;
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert the ref_node to the tail or merge with tail.
 *
 * Return 0 for insert.
 * Return >0 for merge.
 */
static int insert_delayed_ref(struct btrfs_trans_handle *trans,
			      struct btrfs_delayed_ref_root *root,
			      struct btrfs_delayed_ref_head *href,
			      struct btrfs_delayed_ref_node *ref)
{
	struct btrfs_delayed_ref_node *exist;
	int mod;
	int ret = 0;

	spin_lock(&href->lock);
	exist = tree_insert(&href->ref_tree, ref);
	if (!exist)
		goto inserted;

	/* Now we are sure we can merge */
	ret = 1;
	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else {
		/* Need to change action */
		if (exist->ref_mod < ref->ref_mod) {
			exist->action = ref->action;
			mod = -exist->ref_mod;
			exist->ref_mod = ref->ref_mod;
			if (ref->action == BTRFS_ADD_DELAYED_REF)
				list_add_tail(&exist->add_list,
					      &href->ref_add_list);
			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
				ASSERT(!list_empty(&exist->add_list));
				list_del(&exist->add_list);
			} else {
				ASSERT(0);
			}
		} else
			mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* remove existing tail if its ref_mod is zero */
	if (exist->ref_mod == 0)
		drop_delayed_ref(trans, root, href, exist);
	spin_unlock(&href->lock);
	return ret;
inserted:
	if (ref->action == BTRFS_ADD_DELAYED_REF)
		list_add_tail(&ref->add_list, &href->ref_add_list);
	atomic_inc(&root->num_entries);
	spin_unlock(&href->lock);
	return ret;
}

/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_delayed_ref_head *existing,
			 struct btrfs_delayed_ref_head *update)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 flags = btrfs_ref_head_to_space_flags(existing);
	int old_ref_mod;

	BUG_ON(existing->is_data != update->is_data);

	spin_lock(&existing->lock);
	if (update->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing->must_insert_reserved = update->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (update->extent_op) {
		if (!existing->extent_op) {
			existing->extent_op = update->extent_op;
		} else {
			if (update->extent_op->update_key) {
				memcpy(&existing->extent_op->key,
				       &update->extent_op->key,
				       sizeof(update->extent_op->key));
				existing->extent_op->update_key = true;
			}
			if (update->extent_op->update_flags) {
				existing->extent_op->flags_to_set |=
					update->extent_op->flags_to_set;
				existing->extent_op->update_flags = true;
			}
			btrfs_free_delayed_extent_op(update->extent_op);
		}
	}
	/*
	 * update the reference mod on the head to reflect this new operation,
	 * only need the lock for this case because we could be processing it
	 * currently, for refs we just added we know we're a-ok.
	 */
	old_ref_mod = existing->total_ref_mod;
	existing->ref_mod += update->ref_mod;
	existing->total_ref_mod += update->ref_mod;

	/*
	 * If we are going from a positive ref mod to a negative or vice
	 * versa we need to make sure to adjust pending_csums accordingly.
	 */
	if (existing->is_data) {
		u64 csum_leaves =
			btrfs_csum_bytes_to_leaves(fs_info,
						   existing->num_bytes);

		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
			delayed_refs->pending_csums -= existing->num_bytes;
			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
		}
		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
			delayed_refs->pending_csums += existing->num_bytes;
			trans->delayed_ref_updates += csum_leaves;
		}
	}

	/*
	 * This handles the following conditions:
	 *
	 * 1. We had a ref mod of 0 or more and went negative, indicating that
	 *    we may be freeing space, so add our space to the
	 *    total_bytes_pinned counter.
	 * 2. We were negative and went to 0 or positive, so no longer can say
	 *    that the space would be pinned, decrement our counter from the
	 *    total_bytes_pinned counter.
	 * 3. We are now at 0 and have ->must_insert_reserved set, which means
	 *    this was a new allocation and then we dropped it, and thus must
	 *    add our space to the total_bytes_pinned counter.
	 */
	if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
	else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
		btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
	else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);

	spin_unlock(&existing->lock);
}

static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
				  struct btrfs_qgroup_extent_record *qrecord,
				  u64 bytenr, u64 num_bytes, u64 ref_root,
				  u64 reserved, int action, bool is_data,
				  bool is_system)
{
	int count_mod = 1;
	int must_insert_reserved = 0;

	/* If reserved is provided, it must be a data extent. */
	BUG_ON(!is_data && reserved);

	/*
	 * The head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (action == BTRFS_UPDATE_DELAYED_HEAD)
		count_mod = 0;
	else if (action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
	 * accounting when the extent is finally added, or if a later
	 * modification deletes the delayed ref without ever inserting the
	 * extent into the extent allocation tree.  ref->must_insert_reserved
	 * is the flag used to record that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		must_insert_reserved = 1;
	else
		must_insert_reserved = 0;

	refcount_set(&head_ref->refs, 1);
	head_ref->bytenr = bytenr;
	head_ref->num_bytes = num_bytes;
	head_ref->ref_mod = count_mod;
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->is_data = is_data;
	head_ref->is_system = is_system;
	head_ref->ref_tree = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&head_ref->ref_add_list);
	RB_CLEAR_NODE(&head_ref->href_node);
	head_ref->processing = 0;
	head_ref->total_ref_mod = count_mod;
	spin_lock_init(&head_ref->lock);
	mutex_init(&head_ref->mutex);

	if (qrecord) {
		if (ref_root && reserved) {
			qrecord->data_rsv = reserved;
			qrecord->data_rsv_refroot = ref_root;
		}
		qrecord->bytenr = bytenr;
		qrecord->num_bytes = num_bytes;
		qrecord->old_roots = NULL;
	}
}

/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_qgroup_extent_record *qrecord,
		     int action, int *qrecord_inserted_ret)
{
	struct btrfs_delayed_ref_head *existing;
	struct btrfs_delayed_ref_root *delayed_refs;
	int qrecord_inserted = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* Record qgroup extent info if provided */
	if (qrecord) {
		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
					delayed_refs, qrecord))
			kfree(qrecord);
		else
			qrecord_inserted = 1;
	}

	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);

	existing = htree_insert(&delayed_refs->href_root,
				&head_ref->href_node);
	if (existing) {
		update_existing_head_ref(trans, existing, head_ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		head_ref = existing;
	} else {
		u64 flags = btrfs_ref_head_to_space_flags(head_ref);

		if (head_ref->is_data && head_ref->ref_mod < 0) {
			delayed_refs->pending_csums += head_ref->num_bytes;
			trans->delayed_ref_updates +=
				btrfs_csum_bytes_to_leaves(trans->fs_info,
							   head_ref->num_bytes);
		}
		if (head_ref->ref_mod < 0)
			btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
						     head_ref->num_bytes);
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		atomic_inc(&delayed_refs->num_entries);
		trans->delayed_ref_updates++;
	}
	if (qrecord_inserted_ret)
		*qrecord_inserted_ret = qrecord_inserted;

	return head_ref;
}

/*
 * init_delayed_ref_common - Initialize the structure which represents a
 *			     modification to an extent.
 *
 * @fs_info:    Internal to the mounted filesystem mount structure.
 *
 * @ref:	The structure which is going to be initialized.
 *
 * @bytenr:	The logical address of the extent for which a modification is
 *		going to be recorded.
 *
 * @num_bytes:  Size of the extent whose modification is being recorded.
 *
 * @ref_root:	The id of the root where this modification has originated, this
 *		can be either one of the well-known metadata trees or the
 *		subvolume id which references this extent.
 *
 * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *		BTRFS_ADD_DELAYED_EXTENT
 *
 * @ref_type:	Holds the type of the extent which is being recorded, can be
 *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
 *		BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    u64 bytenr, u64 num_bytes, u64 ref_root,
				    int action, u8 ref_type)
{
	u64 seq = 0;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	refcount_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;
	ref->seq = seq;
	ref->type = ref_type;
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);
}

/*
 * add a delayed tree ref.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	int qrecord_inserted;
	bool is_system;
	int action = generic_ref->action;
	int level = generic_ref->tree_ref.level;
	int ret;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u8 ref_type;

	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);

	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	BUG_ON(extent_op && extent_op->is_data);
	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
		return -ENOMEM;
	}

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
	    is_fstree(generic_ref->real_root) &&
	    is_fstree(generic_ref->tree_ref.root) &&
	    !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
			return -ENOMEM;
		}
	}

	if (parent)
		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
	else
		ref_type = BTRFS_TREE_BLOCK_REF_KEY;

	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				generic_ref->tree_ref.root, action, ref_type);
	ref->root = generic_ref->tree_ref.root;
	ref->parent = parent;
	ref->level = level;

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
			      generic_ref->tree_ref.root, 0, action, false,
			      is_system);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (ret > 0)
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);

	if (qrecord_inserted)
		btrfs_qgroup_trace_extent_post(fs_info, record);

	return 0;
}

/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	int qrecord_inserted;
	int action = generic_ref->action;
	int ret;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u64 ref_root = generic_ref->data_ref.ref_root;
	u64 owner = generic_ref->data_ref.ino;
	u64 offset = generic_ref->data_ref.offset;
	u8 ref_type;

	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	if (parent)
		ref_type = BTRFS_SHARED_DATA_REF_KEY;
	else
		ref_type = BTRFS_EXTENT_DATA_REF_KEY;
	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				ref_root, action, ref_type);
	ref->root = ref_root;
	ref->parent = parent;
	ref->objectid = owner;
	ref->offset = offset;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
		return -ENOMEM;
	}

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
	    is_fstree(ref_root) &&
	    is_fstree(generic_ref->real_root) &&
	    !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep,
					head_ref);
			return -ENOMEM;
		}
	}

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
			      reserved, action, true, false);
	head_ref->extent_op = NULL;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (ret > 0)
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(fs_info, record);
	return 0;
}

int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
			      false);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
			     NULL);

	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

/*
 * This does a simple search for the head node for a given extent.  Returns the
 * head node if found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	lockdep_assert_held(&delayed_refs->lock);

	return find_ref_head(delayed_refs, bytenr, false);
}

void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = kmem_cache_create(
			"btrfs_delayed_ref_head",
			sizeof(struct btrfs_delayed_ref_head), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_ref_head_cachep)
		goto fail;

	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
			"btrfs_delayed_tree_ref",
			sizeof(struct btrfs_delayed_tree_ref), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_tree_ref_cachep)
		goto fail;

	btrfs_delayed_data_ref_cachep = kmem_cache_create(
			"btrfs_delayed_data_ref",
			sizeof(struct btrfs_delayed_data_ref), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_data_ref_cachep)
		goto fail;

	btrfs_delayed_extent_op_cachep = kmem_cache_create(
			"btrfs_delayed_extent_op",
			sizeof(struct btrfs_delayed_extent_op), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_extent_op_cachep)
		goto fail;

	return 0;
fail:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}