// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
#include "dev-replace.h"
#include "block-group.h"

#undef DEBUG

/*
 * This is the implementation for the generic read ahead framework.
 *
 * To trigger a readahead, btrfs_reada_add must be called. It will start
 * a read ahead for the given range [start, end) on tree root. The returned
 * handle can either be used to wait on the readahead to finish
 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
 *
 * The read ahead works as follows:
 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
 * reada_start_machine will then search for extents to prefetch and trigger
 * some reads. When a read finishes for a node, all contained node/leaf
 * pointers that lie in the given range will also be enqueued. The reads will
 * be triggered in sequential order, thus giving a big win over a naive
 * enumeration. It will also make use of multi-device layouts. Each disk
 * will have its own read pointer and all disks will be utilized in parallel.
 * Also, no two disks will read both sides of a mirror simultaneously, as this
 * would waste seeking capacity. Instead both disks will read different parts
 * of the filesystem.
 * Any number of readaheads can be started in parallel. The read order will be
 * determined globally, i.e. 2 parallel readaheads will normally finish faster
 * than the 2 started one after another.
 */
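
/*
 * A minimal usage sketch (not part of this file's code): a caller picks a
 * root and a key range, starts the readahead and then either waits for it or
 * lets it finish in the background. Scrub drives the framework in a similar
 * way; the root variable and the all-inclusive key range below are only
 * placeholders.
 *
 *	struct btrfs_key key_start = { 0 };
 *	struct btrfs_key key_end = {
 *		.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
 *	};
 *	struct reada_control *rc;
 *
 *	rc = btrfs_reada_add(root, &key_start, &key_end);
 *	if (!IS_ERR(rc))
 *		btrfs_reada_wait(rc);    (or btrfs_reada_detach(rc))
 */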

#define MAX_IN_FLIGHT 6

struct reada_extctl {
	struct list_head list;
	struct reada_control *rc;
	u64 generation;
};

struct reada_extent {
	u64 logical;
	struct btrfs_key top;
	struct list_head extctl;
	int refcnt;
	spinlock_t lock;
	struct reada_zone *zones[BTRFS_MAX_MIRRORS];
	int nzones;
	int scheduled;
};

struct reada_zone {
	u64 start;
	u64 end;
	u64 elems;
	struct list_head list;
	spinlock_t lock;
	int locked;
	struct btrfs_device *device;
	struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl self */
	int ndevs;
	struct kref refcnt;
};

struct reada_machine_work {
	struct btrfs_work work;
	struct btrfs_fs_info *fs_info;
};

static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
static void reada_control_release(struct kref *kref);
static void reada_zone_release(struct kref *kref);
static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);

static int reada_add_block(struct reada_control *rc, u64 logical,
			   struct btrfs_key *top, u64 generation);

/* recurses */
/* in case of err, eb might be NULL */
static void __readahead_hook(struct btrfs_fs_info *fs_info,
			     struct reada_extent *re, struct extent_buffer *eb,
			     int err)
{
	int nritems;
	int i;
	u64 bytenr;
	u64 generation;
	struct list_head list;

	spin_lock(&re->lock);
	/*
	 * Just take the full list from the extent. Afterwards we
	 * don't need the lock anymore.
	 */
	list_replace_init(&re->extctl, &list);
	re->scheduled = 0;
	spin_unlock(&re->lock);

	/*
	 * This is the error case, the extent buffer has not been
	 * read correctly. We won't access anything from it and
	 * just clean up our data structures. Effectively this will
	 * cut the branch below this node from read ahead.
	 */
	if (err)
		goto cleanup;

	/*
	 * FIXME: currently we just set nritems to 0 if this is a leaf,
	 * effectively ignoring the content. As a next step we could
	 * trigger more readahead depending on the content, e.g.
	 * fetch the checksums for the extents in the leaf.
	 */
	if (!btrfs_header_level(eb))
		goto cleanup;

	nritems = btrfs_header_nritems(eb);
	generation = btrfs_header_generation(eb);
	for (i = 0; i < nritems; i++) {
		struct reada_extctl *rec;
		u64 n_gen;
		struct btrfs_key key;
		struct btrfs_key next_key;

		btrfs_node_key_to_cpu(eb, &key, i);
		if (i + 1 < nritems)
			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
		else
			next_key = re->top;
		bytenr = btrfs_node_blockptr(eb, i);
		n_gen = btrfs_node_ptr_generation(eb, i);

		list_for_each_entry(rec, &list, list) {
			struct reada_control *rc = rec->rc;

			/*
			 * If the generation doesn't match, just ignore this
			 * extctl. This will probably cut off a branch from
			 * prefetch. Alternatively one could start a new (sub-)
			 * prefetch for this branch, starting again from root.
			 * FIXME: move the generation check out of this loop
			 */
#ifdef DEBUG
			if (rec->generation != generation) {
				btrfs_debug(fs_info,
					    "generation mismatch for (%llu,%d,%llu) %llu != %llu",
					    key.objectid, key.type, key.offset,
					    rec->generation, generation);
			}
#endif
			if (rec->generation == generation &&
			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
				reada_add_block(rc, bytenr, &next_key, n_gen);
		}
	}

cleanup:
	/*
	 * free extctl records
	 */
	while (!list_empty(&list)) {
		struct reada_control *rc;
		struct reada_extctl *rec;

		rec = list_first_entry(&list, struct reada_extctl, list);
		list_del(&rec->list);
		rc = rec->rc;
		kfree(rec);

		/*
		 * Each extctl accounts for one element in rc->elems. Once the
		 * last element is gone, drop the ref that was held for having
		 * elements; the temporary kref keeps rc alive for the
		 * wake_up() in case that was the last reference.
		 */
		kref_get(&rc->refcnt);
		if (atomic_dec_and_test(&rc->elems)) {
			kref_put(&rc->refcnt, reada_control_release);
			wake_up(&rc->wait);
		}
		kref_put(&rc->refcnt, reada_control_release);

		reada_extent_put(fs_info, re);	/* one ref for each entry */
	}

	return;
}

int btree_readahead_hook(struct extent_buffer *eb, int err)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;
	struct reada_extent *re;

	/* find extent */
	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree,
			       eb->start >> PAGE_SHIFT);
	if (re)
		re->refcnt++;
	spin_unlock(&fs_info->reada_lock);
	if (!re) {
		ret = -1;
		goto start_machine;
	}

	__readahead_hook(fs_info, re, eb, err);
	reada_extent_put(fs_info, re);	/* our ref */

start_machine:
	reada_start_machine(fs_info);
	return ret;
}

static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
					  struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	int ret;
	struct reada_zone *zone;
	struct btrfs_block_group *cache = NULL;
	u64 start;
	u64 end;
	int i;

	zone = NULL;
	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
				     logical >> PAGE_SHIFT, 1);
	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
		kref_get(&zone->refcnt);
		spin_unlock(&fs_info->reada_lock);
		return zone;
	}

	spin_unlock(&fs_info->reada_lock);

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return NULL;

	start = cache->start;
	end = start + cache->length - 1;
	btrfs_put_block_group(cache);

	zone = kzalloc(sizeof(*zone), GFP_KERNEL);
	if (!zone)
		return NULL;

	ret = radix_tree_preload(GFP_KERNEL);
	if (ret) {
		kfree(zone);
		return NULL;
	}

	zone->start = start;
	zone->end = end;
	INIT_LIST_HEAD(&zone->list);
	spin_lock_init(&zone->lock);
	zone->locked = 0;
	kref_init(&zone->refcnt);
	zone->elems = 0;
	zone->device = dev; /* our device always sits at index 0 */
	for (i = 0; i < bbio->num_stripes; ++i) {
		/* bounds have already been checked */
		zone->devs[i] = bbio->stripes[i].dev;
	}
	zone->ndevs = bbio->num_stripes;

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&dev->reada_zones,
				(unsigned long)(zone->end >> PAGE_SHIFT),
				zone);

	if (ret == -EEXIST) {
		kfree(zone);
		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
					     logical >> PAGE_SHIFT, 1);
		if (ret == 1 && logical >= zone->start && logical <= zone->end)
			kref_get(&zone->refcnt);
		else
			zone = NULL;
	}
	spin_unlock(&fs_info->reada_lock);
	radix_tree_preload_end();

	return zone;
}
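
/*
 * Look up the readahead extent for @logical or create a new one. On success
 * a reference is held on the returned extent. For a new extent, the mirrors
 * are mapped, a reada_zone is attached for every stripe device and the
 * extent is inserted into the global radix tree as well as into all
 * per-device trees, all or nothing.
 */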
static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
					      u64 logical,
					      struct btrfs_key *top)
{
	int ret;
	struct reada_extent *re = NULL;
	struct reada_extent *re_exist = NULL;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_device *dev;
	struct btrfs_device *prev_dev;
	u64 length;
	int real_stripes;
	int nzones = 0;
	unsigned long index = logical >> PAGE_SHIFT;
	int dev_replace_is_ongoing;
	int have_zone = 0;

	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree, index);
	if (re)
		re->refcnt++;
	spin_unlock(&fs_info->reada_lock);

	if (re)
		return re;

	re = kzalloc(sizeof(*re), GFP_KERNEL);
	if (!re)
		return NULL;

	re->logical = logical;
	re->top = *top;
	INIT_LIST_HEAD(&re->extctl);
	spin_lock_init(&re->lock);
	re->refcnt = 1;

	/*
	 * map block
	 */
	length = fs_info->nodesize;
	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &length, &bbio, 0);
	if (ret || !bbio || length < fs_info->nodesize)
		goto error;

	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
		btrfs_err(fs_info,
			  "readahead: more than %d copies not supported",
			  BTRFS_MAX_MIRRORS);
		goto error;
	}

	real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	for (nzones = 0; nzones < real_stripes; ++nzones) {
		struct reada_zone *zone;

		dev = bbio->stripes[nzones].dev;

		/* cannot read ahead on missing device. */
		if (!dev->bdev)
			continue;

		zone = reada_find_zone(dev, logical, bbio);
		if (!zone)
			continue;

		re->zones[re->nzones++] = zone;
		spin_lock(&zone->lock);
		if (!zone->elems)
			kref_get(&zone->refcnt);
		++zone->elems;
		spin_unlock(&zone->lock);
		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}
	if (re->nzones == 0) {
		/* not a single zone found, error and out */
		goto error;
	}

	/* Insert extent in reada tree + all per-device trees, all or nothing */
	down_read(&fs_info->dev_replace.rwsem);
	ret = radix_tree_preload(GFP_KERNEL);
	if (ret) {
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
	if (ret == -EEXIST) {
		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
		re_exist->refcnt++;
		spin_unlock(&fs_info->reada_lock);
		radix_tree_preload_end();
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}
	if (ret) {
		spin_unlock(&fs_info->reada_lock);
		radix_tree_preload_end();
		up_read(&fs_info->dev_replace.rwsem);
		goto error;
	}
	radix_tree_preload_end();
	prev_dev = NULL;
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
			&fs_info->dev_replace);
	for (nzones = 0; nzones < re->nzones; ++nzones) {
		dev = re->zones[nzones]->device;

		if (dev == prev_dev) {
			/*
			 * in case of DUP, just add the first zone. As both
			 * are on the same device, there's nothing to gain
			 * from adding both.
			 * Also, it wouldn't work, as the tree is per device
			 * and adding would fail with EEXIST
			 */
			continue;
		}
		if (!dev->bdev)
			continue;

		if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
			continue;

		if (dev_replace_is_ongoing &&
		    dev == fs_info->dev_replace.tgtdev) {
			/*
			 * as this device is selected for reading only as
			 * a last resort, skip it for read ahead.
			 */
			continue;
		}
		prev_dev = dev;
		ret = radix_tree_insert(&dev->reada_extents, index, re);
		if (ret) {
			while (--nzones >= 0) {
				dev = re->zones[nzones]->device;
				BUG_ON(dev == NULL);
				/* ignore whether the entry was inserted */
				radix_tree_delete(&dev->reada_extents, index);
			}
			radix_tree_delete(&fs_info->reada_tree, index);
			spin_unlock(&fs_info->reada_lock);
			up_read(&fs_info->dev_replace.rwsem);
			goto error;
		}
		have_zone = 1;
	}
	if (!have_zone)
		radix_tree_delete(&fs_info->reada_tree, index);
	spin_unlock(&fs_info->reada_lock);
	up_read(&fs_info->dev_replace.rwsem);

	if (!have_zone)
		goto error;

	btrfs_put_bbio(bbio);
	return re;

error:
	for (nzones = 0; nzones < re->nzones; ++nzones) {
		struct reada_zone *zone;

		zone = re->zones[nzones];
		kref_get(&zone->refcnt);
		spin_lock(&zone->lock);
		--zone->elems;
		if (zone->elems == 0) {
			/*
			 * no fs_info->reada_lock needed, as this can't be
			 * the last ref
			 */
			kref_put(&zone->refcnt, reada_zone_release);
		}
		spin_unlock(&zone->lock);

		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}
	btrfs_put_bbio(bbio);
	kfree(re);
	return re_exist;
}

static void reada_extent_put(struct btrfs_fs_info *fs_info,
			     struct reada_extent *re)
{
	int i;
	unsigned long index = re->logical >> PAGE_SHIFT;

	spin_lock(&fs_info->reada_lock);
	if (--re->refcnt) {
		spin_unlock(&fs_info->reada_lock);
		return;
	}

	radix_tree_delete(&fs_info->reada_tree, index);
	for (i = 0; i < re->nzones; ++i) {
		struct reada_zone *zone = re->zones[i];

		radix_tree_delete(&zone->device->reada_extents, index);
	}

	spin_unlock(&fs_info->reada_lock);

	for (i = 0; i < re->nzones; ++i) {
		struct reada_zone *zone = re->zones[i];

		kref_get(&zone->refcnt);
		spin_lock(&zone->lock);
		--zone->elems;
		if (zone->elems == 0) {
			/*
			 * no fs_info->reada_lock needed, as this can't be
			 * the last ref
			 */
			kref_put(&zone->refcnt, reada_zone_release);
		}
		spin_unlock(&zone->lock);

		spin_lock(&fs_info->reada_lock);
		kref_put(&zone->refcnt, reada_zone_release);
		spin_unlock(&fs_info->reada_lock);
	}

	kfree(re);
}

static void reada_zone_release(struct kref *kref)
{
	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);

	radix_tree_delete(&zone->device->reada_zones,
			  zone->end >> PAGE_SHIFT);

	kfree(zone);
}

static void reada_control_release(struct kref *kref)
{
	struct reada_control *rc = container_of(kref, struct reada_control,
						refcnt);

	kfree(rc);
}

static int reada_add_block(struct reada_control *rc, u64 logical,
			   struct btrfs_key *top, u64 generation)
{
	struct btrfs_fs_info *fs_info = rc->fs_info;
	struct reada_extent *re;
	struct reada_extctl *rec;

	/* takes one ref */
	re = reada_find_extent(fs_info, logical, top);
	if (!re)
		return -1;

	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
	if (!rec) {
		reada_extent_put(fs_info, re);
		return -ENOMEM;
	}

	rec->rc = rc;
	rec->generation = generation;
	atomic_inc(&rc->elems);

	spin_lock(&re->lock);
	list_add_tail(&rec->list, &re->extctl);
	spin_unlock(&re->lock);

	/* leave the ref on the extent */

	return 0;
}
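
/*
 * Readahead tries not to read both copies of a mirrored block at the same
 * time. When a device picks a zone to work on, the corresponding zones on
 * all peer devices are marked locked, so that reada_pick_zone() on those
 * devices prefers other zones and falls back to locked zones only when
 * nothing else is left.
 */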

/*
 * called with fs_info->reada_lock held
 */
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
	int i;
	unsigned long index = zone->end >> PAGE_SHIFT;

	for (i = 0; i < zone->ndevs; ++i) {
		struct reada_zone *peer;

		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
		if (peer && peer->device != zone->device)
			peer->locked = lock;
	}
}

/*
 * called with fs_info->reada_lock held
 */
static int reada_pick_zone(struct btrfs_device *dev)
{
	struct reada_zone *top_zone = NULL;
	struct reada_zone *top_locked_zone = NULL;
	u64 top_elems = 0;
	u64 top_locked_elems = 0;
	unsigned long index = 0;
	int ret;

	if (dev->reada_curr_zone) {
		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
		dev->reada_curr_zone = NULL;
	}
	/* pick the zone with the most elements */
	while (1) {
		struct reada_zone *zone;

		ret = radix_tree_gang_lookup(&dev->reada_zones,
					     (void **)&zone, index, 1);
		if (ret == 0)
			break;
		index = (zone->end >> PAGE_SHIFT) + 1;
		if (zone->locked) {
			if (zone->elems > top_locked_elems) {
				top_locked_elems = zone->elems;
				top_locked_zone = zone;
			}
		} else {
			if (zone->elems > top_elems) {
				top_elems = zone->elems;
				top_zone = zone;
			}
		}
	}
	if (top_zone)
		dev->reada_curr_zone = top_zone;
	else if (top_locked_zone)
		dev->reada_curr_zone = top_locked_zone;
	else
		return 0;

	dev->reada_next = dev->reada_curr_zone->start;
	kref_get(&dev->reada_curr_zone->refcnt);
	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);

	return 1;
}

static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
				    int mirror_num, struct extent_buffer **eb)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return 0;

	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);

	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ret;
	}

	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
		free_extent_buffer_stale(buf);
		return -EIO;
	} else if (extent_buffer_uptodate(buf)) {
		*eb = buf;
	} else {
		free_extent_buffer(buf);
	}
	return 0;
}

static int reada_start_machine_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct reada_extent *re = NULL;
	int mirror_num = 0;
	struct extent_buffer *eb = NULL;
	u64 logical;
	int ret;
	int i;

	spin_lock(&fs_info->reada_lock);
	if (dev->reada_curr_zone == NULL) {
		ret = reada_pick_zone(dev);
		if (!ret) {
			spin_unlock(&fs_info->reada_lock);
			return 0;
		}
	}
	/*
	 * FIXME: currently we issue the reads one extent at a time. If we
	 * have a contiguous block of extents, we could also coalesce them or
	 * use plugging to speed things up
	 */
	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
				     dev->reada_next >> PAGE_SHIFT, 1);
	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
		ret = reada_pick_zone(dev);
		if (!ret) {
			spin_unlock(&fs_info->reada_lock);
			return 0;
		}
		re = NULL;
		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
					     dev->reada_next >> PAGE_SHIFT, 1);
	}
	if (ret == 0) {
		spin_unlock(&fs_info->reada_lock);
		return 0;
	}
	dev->reada_next = re->logical + fs_info->nodesize;
	re->refcnt++;

	spin_unlock(&fs_info->reada_lock);

	spin_lock(&re->lock);
	if (re->scheduled || list_empty(&re->extctl)) {
		spin_unlock(&re->lock);
		reada_extent_put(fs_info, re);
		return 0;
	}
	re->scheduled = 1;
	spin_unlock(&re->lock);

	/*
	 * find mirror num
	 */
	for (i = 0; i < re->nzones; ++i) {
		if (re->zones[i]->device == dev) {
			mirror_num = i + 1;
			break;
		}
	}
	logical = re->logical;

	atomic_inc(&dev->reada_in_flight);
	ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
	if (ret)
		__readahead_hook(fs_info, re, NULL, ret);
	else if (eb)
		__readahead_hook(fs_info, re, eb, ret);

	if (eb)
		free_extent_buffer(eb);

	atomic_dec(&dev->reada_in_flight);
	reada_extent_put(fs_info, re);

	return 1;
}

static void reada_start_machine_worker(struct btrfs_work *work)
{
	struct reada_machine_work *rmw;
	int old_ioprio;

	rmw = container_of(work, struct reada_machine_work, work);

	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
				       task_nice_ioprio(current));
	set_task_ioprio(current, BTRFS_IOPRIO_READA);
	__reada_start_machine(rmw->fs_info);
	set_task_ioprio(current, old_ioprio);

	atomic_dec(&rmw->fs_info->reada_works_cnt);

	kfree(rmw);
}

/* Try to start up to 10k READA requests for a group of devices */
static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
	u64 enqueued;
	u64 total = 0;
	struct btrfs_device *device;

	do {
		enqueued = 0;
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (atomic_read(&device->reada_in_flight) <
			    MAX_IN_FLIGHT)
				enqueued += reada_start_machine_dev(device);
		}
		total += enqueued;
	} while (enqueued && total < 10000);

	return total;
}

static void __reada_start_machine(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	int i;
	u64 enqueued = 0;

	mutex_lock(&fs_devices->device_list_mutex);

	enqueued += reada_start_for_fsdevs(fs_devices);
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		enqueued += reada_start_for_fsdevs(seed_devs);

	mutex_unlock(&fs_devices->device_list_mutex);
	if (enqueued == 0)
		return;

	/*
	 * If everything is already in the cache, this is effectively single
	 * threaded. To a) not hold the caller for too long and b) to utilize
	 * more cores, we break the loop above after 10000 iterations and now
	 * enqueue to workers to finish it. This will distribute the load to
	 * the cores.
	 */
	for (i = 0; i < 2; ++i) {
		reada_start_machine(fs_info);
		if (atomic_read(&fs_info->reada_works_cnt) >
		    BTRFS_MAX_MIRRORS * 2)
			break;
	}
}

static void reada_start_machine(struct btrfs_fs_info *fs_info)
{
	struct reada_machine_work *rmw;

	rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
	if (!rmw) {
		/* FIXME we cannot handle this properly right now */
		BUG();
	}
	btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
	rmw->fs_info = fs_info;

	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
	atomic_inc(&fs_info->reada_works_cnt);
}

#ifdef DEBUG
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	unsigned long index;
	int ret;
	int i;
	int j;
	int cnt;

	spin_lock(&fs_info->reada_lock);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
			    atomic_read(&device->reada_in_flight));
		index = 0;
		while (1) {
			struct reada_zone *zone;

			ret = radix_tree_gang_lookup(&device->reada_zones,
						     (void **)&zone, index, 1);
			if (ret == 0)
				break;
			pr_debug("  zone %llu-%llu elems %llu locked %d devs",
				 zone->start, zone->end, zone->elems,
				 zone->locked);
			for (j = 0; j < zone->ndevs; ++j) {
				pr_cont(" %lld",
					zone->devs[j]->devid);
			}
			if (device->reada_curr_zone == zone)
				pr_cont(" curr off %llu",
					device->reada_next - zone->start);
			pr_cont("\n");
			index = (zone->end >> PAGE_SHIFT) + 1;
		}
		cnt = 0;
		index = 0;
		while (all) {
			struct reada_extent *re = NULL;

			ret = radix_tree_gang_lookup(&device->reada_extents,
						     (void **)&re, index, 1);
			if (ret == 0)
				break;
			pr_debug("  re: logical %llu size %u empty %d scheduled %d",
				 re->logical, fs_info->nodesize,
				 list_empty(&re->extctl), re->scheduled);

			for (i = 0; i < re->nzones; ++i) {
				pr_cont(" zone %llu-%llu devs",
					re->zones[i]->start,
					re->zones[i]->end);
				for (j = 0; j < re->zones[i]->ndevs; ++j) {
					pr_cont(" %lld",
						re->zones[i]->devs[j]->devid);
				}
			}
			pr_cont("\n");
			index = (re->logical >> PAGE_SHIFT) + 1;
			if (++cnt > 15)
				break;
		}
	}

	index = 0;
	cnt = 0;
	while (all) {
		struct reada_extent *re = NULL;

		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
					     index, 1);
		if (ret == 0)
			break;
		if (!re->scheduled) {
			index = (re->logical >> PAGE_SHIFT) + 1;
			continue;
		}
		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
			 re->logical, fs_info->nodesize,
			 list_empty(&re->extctl), re->scheduled);
		for (i = 0; i < re->nzones; ++i) {
			pr_cont(" zone %llu-%llu devs",
				re->zones[i]->start,
				re->zones[i]->end);
			for (j = 0; j < re->zones[i]->ndevs; ++j) {
				pr_cont(" %lld",
					re->zones[i]->devs[j]->devid);
			}
		}
		pr_cont("\n");
		index = (re->logical >> PAGE_SHIFT) + 1;
	}
	spin_unlock(&fs_info->reada_lock);
}
#endif

/*
 * interface
 */
struct reada_control *btrfs_reada_add(struct btrfs_root *root,
			struct btrfs_key *key_start, struct btrfs_key *key_end)
{
	struct reada_control *rc;
	u64 start;
	u64 generation;
	int ret;
	struct extent_buffer *node;
	static struct btrfs_key max_key = {
		.objectid = (u64)-1,
		.type = (u8)-1,
		.offset = (u64)-1
	};

	rc = kzalloc(sizeof(*rc), GFP_KERNEL);
	if (!rc)
		return ERR_PTR(-ENOMEM);

	rc->fs_info = root->fs_info;
	rc->key_start = *key_start;
	rc->key_end = *key_end;
	atomic_set(&rc->elems, 0);
	init_waitqueue_head(&rc->wait);
	kref_init(&rc->refcnt);
	kref_get(&rc->refcnt); /* one ref for having elements */

	node = btrfs_root_node(root);
	start = node->start;
	generation = btrfs_header_generation(node);
	free_extent_buffer(node);

	ret = reada_add_block(rc, start, &max_key, generation);
	if (ret) {
		kfree(rc);
		return ERR_PTR(ret);
	}

	reada_start_machine(root->fs_info);

	return rc;
}

#ifdef DEBUG
int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->fs_info;

	while (atomic_read(&rc->elems)) {
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   5 * HZ);
		dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
	}

	dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}
#else
int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->fs_info;

	while (atomic_read(&rc->elems)) {
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   (HZ + 9) / 10);
	}

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}
#endif

void btrfs_reada_detach(void *handle)
{
	struct reada_control *rc = handle;

	kref_put(&rc->refcnt, reada_control_release);
}

/*
 * Before removing a device (device replace or device remove ioctls), call this
 * function to wait for all existing readahead requests on the device and to
 * make sure no one queues more readahead requests for the device.
 *
 * Must be called while holding neither the device list mutex nor the device
 * replace semaphore, otherwise it will deadlock.
 */
void btrfs_reada_remove_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;

	/* Serialize with readahead extent creation at reada_find_extent(). */
	spin_lock(&fs_info->reada_lock);
	set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
	spin_unlock(&fs_info->reada_lock);

	/*
	 * There might be readahead requests added to the radix trees which
	 * were not yet added to the readahead work queue. We need to start
	 * them and wait for their completion, otherwise we can end up with
	 * use-after-free problems when dropping the last reference on the
	 * readahead extents and their zones, as they need to access the
	 * device structure.
	 */
	reada_start_machine(fs_info);
	btrfs_flush_workqueue(fs_info->readahead_workers);
}

/*
 * If an error happens while removing a device (device replace or device remove
 * ioctls) after calling btrfs_reada_remove_dev(), call this to undo what that
 * function did. This is safe to call even if btrfs_reada_remove_dev() was not
 * called before.
 */
void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
{
	spin_lock(&dev->fs_info->reada_lock);
	clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
	spin_unlock(&dev->fs_info->reada_lock);
}