/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n). This works more nicely for repeated
 * insert remove loops than (2n + 1).
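 *
 * Worked example (illustrative arithmetic only, not a value taken from a
 * real configuration): if block_size - sizeof(struct node_header) left
 * room for 254 key/value pairs of 16 bytes each (8 byte key + 8 byte
 * value), then n = 254 / 3 = 84 and max_entries = 3 * 84 = 252.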
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
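		 * The remaining holders still reference this node, so the
		 * node is not actually freed and the counts its children
		 * carry are still needed; recursing into them here would
		 * drop references they rely on.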
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, and as such should be
	 * considered an FS op. We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}
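/*
 * A multi-level btree is a tree of trees: for every level except the last
 * the looked-up value is a __le64 block number naming the root of the next
 * level's btree, and only the final level holds caller-visible values.
 * dm_btree_lookup() below walks one level per loop iteration.
 */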
int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across. Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |    |
 *                v    +------+
 *          +---------+       |
 *          | A* +++  |       v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
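 *
 * The parent keeps its (updated) entry pointing at A* and gains one new
 * entry for B, keyed by B's first key, which is why the caller must
 * guarantee the parent has room for another child.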
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |    |
 *    +-----+    +-----+
 *    |                |
 *    v                v
 * +-------+       +-------+
 * | B +++ |       | C +++ |
 * +-------+       +-------+
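 *
 * This is used for the top node of an insert walk when it is full: the
 * node's contents move down into two fresh children while the original
 * block stays in place as the (now internal) parent, so existing
 * references to that block number remain valid.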
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}
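/*
 * Walks from @root towards the leaf that should hold @key, shadowing each
 * node on the way down and splitting any full node it meets
 * (btree_split_beneath() for the top node of the walk,
 * btree_split_sibling() otherwise). On success *index is the slot in the
 * shadowed leaf where @key already lives, or where it should be inserted.
 */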
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
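/*
 * A rough caller-side sketch (not code from this file), assuming a
 * single-level tree with an 8-byte little-endian value type; "key",
 * "data_block" and "new_root" are hypothetical caller variables:
 *
 *	__le64 value_le = cpu_to_le64(data_block);
 *
 *	r = dm_btree_insert(info, root, &key, &value_le, &new_root);
 *	if (r)
 *		return r;
 *
 * On success new_root names the shadowed tree and should be used for all
 * subsequent operations in this transaction.
 */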
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space. Also this only works for single level trees.
 */
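/*
 * walk_node() below recurses depth first: entries of an internal node are
 * followed as child blocks, leaf entries are handed to @fn together with
 * their (single-level) key. A caller-supplied callback might look roughly
 * like this (illustrative only, not part of this file):
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 * driven via dm_btree_walk(info, root, count_entry, &count).
 */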
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

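/*
 * A rough usage sketch of the cursor interface (caller-side code, not part
 * of this file), assuming an 8-byte (__le64) value type:
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *
 *	r = dm_btree_cursor_begin(info, root, false, &c);
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *
 *		... use key / value_le ...
 *
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 *	if (r == -ENODATA)
 *		r = 0;
 */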
int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);