// SPDX-License-Identifier: GPL-2.0
#include <linux/errno.h>
#include <linux/numa.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/threads.h>
#include <linux/preempt.h>
#include <linux/irqflags.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/device-mapper.h>

#include "dm-core.h"
#include "dm-stats.h"

#define DM_MSG_PREFIX "stats"

static int dm_stat_need_rcu_barrier;

/*
 * Using 64-bit values to avoid overflow (which is a
 * problem that block/genhd.c's IO accounting has).
 */
struct dm_stat_percpu {
	unsigned long long sectors[2];
	unsigned long long ios[2];
	unsigned long long merges[2];
	unsigned long long ticks[2];
	unsigned long long io_ticks[2];
	unsigned long long io_ticks_total;
	unsigned long long time_in_queue;
	unsigned long long *histogram;
};

struct dm_stat_shared {
	atomic_t in_flight[2];
	unsigned long long stamp;
	struct dm_stat_percpu tmp;
};

struct dm_stat {
	struct list_head list_entry;
	int id;
	unsigned stat_flags;
	size_t n_entries;
	sector_t start;
	sector_t end;
	sector_t step;
	unsigned n_histogram_entries;
	unsigned long long *histogram_boundaries;
	const char *program_id;
	const char *aux_data;
	struct rcu_head rcu_head;
	size_t shared_alloc_size;
	size_t percpu_alloc_size;
	size_t histogram_alloc_size;
	struct dm_stat_percpu *stat_percpu[NR_CPUS];
	struct dm_stat_shared stat_shared[];
};

#define STAT_PRECISE_TIMESTAMPS		1

struct dm_stats_last_position {
	sector_t last_sector;
	unsigned last_rw;
};

/*
 * A typo on the command line could possibly make the kernel run out of memory
 * and crash. To prevent the crash we account all used memory. We fail if we
 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
 */
#define DM_STATS_MEMORY_FACTOR		4
#define DM_STATS_VMALLOC_FACTOR		2
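
/*
 * Informal example of the cap above: with 8 GiB of RAM,
 * DM_STATS_MEMORY_FACTOR limits all statistics allocations combined to
 * roughly 2 GiB, and on CONFIG_MMU systems DM_STATS_VMALLOC_FACTOR further
 * limits them to half of the vmalloc arena, whichever is smaller.
 */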

static DEFINE_SPINLOCK(shared_memory_lock);

static unsigned long shared_memory_amount;

static bool __check_shared_memory(size_t alloc_size)
{
	size_t a;

	a = shared_memory_amount + alloc_size;
	if (a < shared_memory_amount)
		return false;
	if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
		return false;
#ifdef CONFIG_MMU
	if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
		return false;
#endif
	return true;
}

static bool check_shared_memory(size_t alloc_size)
{
	bool ret;

	spin_lock_irq(&shared_memory_lock);

	ret = __check_shared_memory(alloc_size);

	spin_unlock_irq(&shared_memory_lock);

	return ret;
}

static bool claim_shared_memory(size_t alloc_size)
{
	spin_lock_irq(&shared_memory_lock);

	if (!__check_shared_memory(alloc_size)) {
		spin_unlock_irq(&shared_memory_lock);
		return false;
	}

	shared_memory_amount += alloc_size;

	spin_unlock_irq(&shared_memory_lock);

	return true;
}

static void free_shared_memory(size_t alloc_size)
{
	unsigned long flags;

	spin_lock_irqsave(&shared_memory_lock, flags);

	if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
		spin_unlock_irqrestore(&shared_memory_lock, flags);
		DMCRIT("Memory usage accounting bug.");
		return;
	}

	shared_memory_amount -= alloc_size;

	spin_unlock_irqrestore(&shared_memory_lock, flags);
}

static void *dm_kvzalloc(size_t alloc_size, int node)
{
	void *p;

	if (!claim_shared_memory(alloc_size))
		return NULL;

	p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
	if (p)
		return p;

	free_shared_memory(alloc_size);

	return NULL;
}

static void dm_kvfree(void *ptr, size_t alloc_size)
{
	if (!ptr)
		return;

	free_shared_memory(alloc_size);

	kvfree(ptr);
}

static void dm_stat_free(struct rcu_head *head)
{
	int cpu;
	struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);

	kfree(s->histogram_boundaries);
	kfree(s->program_id);
	kfree(s->aux_data);
	for_each_possible_cpu(cpu) {
		dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
	}
	dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
	dm_kvfree(s, s->shared_alloc_size);
}

static int dm_stat_in_flight(struct dm_stat_shared *shared)
{
	return atomic_read(&shared->in_flight[READ]) +
	       atomic_read(&shared->in_flight[WRITE]);
}

int dm_stats_init(struct dm_stats *stats)
{
	int cpu;
	struct dm_stats_last_position *last;

	mutex_init(&stats->mutex);
	INIT_LIST_HEAD(&stats->list);
	stats->last = alloc_percpu(struct dm_stats_last_position);
	if (!stats->last)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		last = per_cpu_ptr(stats->last, cpu);
		last->last_sector = (sector_t)ULLONG_MAX;
		last->last_rw = UINT_MAX;
	}

	return 0;
}

void dm_stats_cleanup(struct dm_stats *stats)
{
	size_t ni;
	struct dm_stat *s;
	struct dm_stat_shared *shared;

	while (!list_empty(&stats->list)) {
		s = container_of(stats->list.next, struct dm_stat, list_entry);
		list_del(&s->list_entry);
		for (ni = 0; ni < s->n_entries; ni++) {
			shared = &s->stat_shared[ni];
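			/*
			 * By the time the device is torn down every region
			 * should be quiescent; a nonzero in-flight counter
			 * means an I/O was accounted at submission but never
			 * at completion.
			 */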
			if (WARN_ON(dm_stat_in_flight(shared))) {
				DMCRIT("leaked in-flight counter at index %lu "
				       "(start %llu, end %llu, step %llu): reads %d, writes %d",
				       (unsigned long)ni,
				       (unsigned long long)s->start,
				       (unsigned long long)s->end,
				       (unsigned long long)s->step,
				       atomic_read(&shared->in_flight[READ]),
				       atomic_read(&shared->in_flight[WRITE]));
			}
			cond_resched();
		}
		dm_stat_free(&s->rcu_head);
	}
	free_percpu(stats->last);
	mutex_destroy(&stats->mutex);
}

static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
			   sector_t step, unsigned stat_flags,
			   unsigned n_histogram_entries,
			   unsigned long long *histogram_boundaries,
			   const char *program_id, const char *aux_data,
			   void (*suspend_callback)(struct mapped_device *),
			   void (*resume_callback)(struct mapped_device *),
			   struct mapped_device *md)
{
	struct list_head *l;
	struct dm_stat *s, *tmp_s;
	sector_t n_entries;
	size_t ni;
	size_t shared_alloc_size;
	size_t percpu_alloc_size;
	size_t histogram_alloc_size;
	struct dm_stat_percpu *p;
	int cpu;
	int ret_id;
	int r;

	if (end < start || !step)
		return -EINVAL;

	n_entries = end - start;
	if (dm_sector_div64(n_entries, step))
		n_entries++;

	if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
		return -EOVERFLOW;

	shared_alloc_size = struct_size(s, stat_shared, n_entries);
	if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
		return -EOVERFLOW;

	percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
		return -EOVERFLOW;

	histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
	if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
		return -EOVERFLOW;

	if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
				 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
		return -ENOMEM;

	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
	if (!s)
		return -ENOMEM;

	s->stat_flags = stat_flags;
	s->n_entries = n_entries;
	s->start = start;
	s->end = end;
	s->step = step;
	s->shared_alloc_size = shared_alloc_size;
	s->percpu_alloc_size = percpu_alloc_size;
	s->histogram_alloc_size = histogram_alloc_size;

	s->n_histogram_entries = n_histogram_entries;
	s->histogram_boundaries = kmemdup(histogram_boundaries,
					  s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
	if (!s->histogram_boundaries) {
		r = -ENOMEM;
		goto out;
	}

	s->program_id = kstrdup(program_id, GFP_KERNEL);
	if (!s->program_id) {
		r = -ENOMEM;
		goto out;
	}
	s->aux_data = kstrdup(aux_data, GFP_KERNEL);
	if (!s->aux_data) {
		r = -ENOMEM;
		goto out;
	}

	for (ni = 0; ni < n_entries; ni++) {
		atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
		cond_resched();
	}

	if (s->n_histogram_entries) {
		unsigned long long *hi;
		hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
		if (!hi) {
			r = -ENOMEM;
			goto out;
		}
		for (ni = 0; ni < n_entries; ni++) {
			s->stat_shared[ni].tmp.histogram = hi;
			hi += s->n_histogram_entries + 1;
			cond_resched();
		}
	}

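	/*
	 * Allocate the per-CPU counters (and optional histogram buckets)
	 * separately for every possible CPU, preferably on that CPU's NUMA
	 * node, so the hot accounting path mostly touches local memory.
	 */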
	for_each_possible_cpu(cpu) {
		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
		if (!p) {
			r = -ENOMEM;
			goto out;
		}
		s->stat_percpu[cpu] = p;
		if (s->n_histogram_entries) {
			unsigned long long *hi;
			hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
			if (!hi) {
				r = -ENOMEM;
				goto out;
			}
			for (ni = 0; ni < n_entries; ni++) {
				p[ni].histogram = hi;
				hi += s->n_histogram_entries + 1;
				cond_resched();
			}
		}
	}

	/*
	 * Suspend/resume to make sure there is no i/o in flight,
	 * so that newly created statistics will be exact.
	 *
	 * (note: we couldn't suspend earlier because we must not
	 * allocate memory while suspended)
	 */
	suspend_callback(md);

	mutex_lock(&stats->mutex);
	s->id = 0;
	list_for_each(l, &stats->list) {
		tmp_s = container_of(l, struct dm_stat, list_entry);
		if (WARN_ON(tmp_s->id < s->id)) {
			r = -EINVAL;
			goto out_unlock_resume;
		}
		if (tmp_s->id > s->id)
			break;
		if (unlikely(s->id == INT_MAX)) {
			r = -ENFILE;
			goto out_unlock_resume;
		}
		s->id++;
	}
	ret_id = s->id;
	list_add_tail_rcu(&s->list_entry, l);
	mutex_unlock(&stats->mutex);

	resume_callback(md);

	return ret_id;

out_unlock_resume:
	mutex_unlock(&stats->mutex);
	resume_callback(md);
out:
	dm_stat_free(&s->rcu_head);
	return r;
}

static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
{
	struct dm_stat *s;

	list_for_each_entry(s, &stats->list, list_entry) {
		if (s->id > id)
			break;
		if (s->id == id)
			return s;
	}

	return NULL;
}

static int dm_stats_delete(struct dm_stats *stats, int id)
{
	struct dm_stat *s;
	int cpu;

	mutex_lock(&stats->mutex);

	s = __dm_stats_find(stats, id);
	if (!s) {
		mutex_unlock(&stats->mutex);
		return -ENOENT;
	}

	list_del_rcu(&s->list_entry);
	mutex_unlock(&stats->mutex);

	/*
	 * vfree can't be called from RCU callback
	 */
	for_each_possible_cpu(cpu)
		if (is_vmalloc_addr(s->stat_percpu) ||
		    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
			goto do_sync_free;
	if (is_vmalloc_addr(s) ||
	    is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
		synchronize_rcu_expedited();
		dm_stat_free(&s->rcu_head);
	} else {
		WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
		call_rcu(&s->rcu_head, dm_stat_free);
	}
	return 0;
}

static int dm_stats_list(struct dm_stats *stats, const char *program,
			 char *result, unsigned maxlen)
{
	struct dm_stat *s;
	sector_t len;
	unsigned sz = 0;

	/*
	 * Output format:
	 *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
	 */

	mutex_lock(&stats->mutex);
	list_for_each_entry(s, &stats->list, list_entry) {
		if (!program || !strcmp(program, s->program_id)) {
			len = s->end - s->start;
			DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
			       (unsigned long long)s->start,
			       (unsigned long long)len,
			       (unsigned long long)s->step,
			       s->program_id,
			       s->aux_data);
			if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
				DMEMIT(" precise_timestamps");
			if (s->n_histogram_entries) {
				unsigned i;
				DMEMIT(" histogram:");
				for (i = 0; i < s->n_histogram_entries; i++) {
					if (i)
						DMEMIT(",");
					DMEMIT("%llu", s->histogram_boundaries[i]);
				}
			}
			DMEMIT("\n");
		}
		cond_resched();
	}
	mutex_unlock(&stats->mutex);

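	/*
	 * Each region above is listed as, e.g. (illustrative values):
	 *   "0: 0+262144 512 - -"
	 * meaning region 0 spans 262144 sectors starting at sector 0, uses a
	 * 512-sector step, and has the default program_id and aux_data of "-".
	 */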
	return 1;
}

static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
			  struct dm_stat_percpu *p)
{
	/*
	 * This is racy, but so is part_round_stats_single.
	 */
	unsigned long long now, difference;
	unsigned in_flight_read, in_flight_write;

	if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
		now = jiffies;
	else
		now = ktime_to_ns(ktime_get());

	difference = now - shared->stamp;
	if (!difference)
		return;

	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
	if (in_flight_read)
		p->io_ticks[READ] += difference;
	if (in_flight_write)
		p->io_ticks[WRITE] += difference;
	if (in_flight_read + in_flight_write) {
		p->io_ticks_total += difference;
		p->time_in_queue += (in_flight_read + in_flight_write) * difference;
	}
	shared->stamp = now;
}

static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
			      int idx, sector_t len,
			      struct dm_stats_aux *stats_aux, bool end,
			      unsigned long duration_jiffies)
{
	struct dm_stat_shared *shared = &s->stat_shared[entry];
	struct dm_stat_percpu *p;

	/*
	 * For strict correctness we should use local_irq_save/restore
	 * instead of preempt_disable/enable.
	 *
	 * preempt_disable/enable is racy if the driver finishes bios
	 * from non-interrupt context as well as from interrupt context
	 * or from several different interrupts.
	 *
	 * On 64-bit architectures the race only results in not counting some
	 * events, so it is acceptable. On 32-bit architectures the race could
	 * cause the counter to go off by 2^32, so we need to do proper locking
	 * there.
	 *
	 * part_stat_lock()/part_stat_unlock() have this race too.
	 */
#if BITS_PER_LONG == 32
	unsigned long flags;
	local_irq_save(flags);
#else
	preempt_disable();
#endif
	p = &s->stat_percpu[smp_processor_id()][entry];

	if (!end) {
		dm_stat_round(s, shared, p);
		atomic_inc(&shared->in_flight[idx]);
	} else {
		unsigned long long duration;
		dm_stat_round(s, shared, p);
		atomic_dec(&shared->in_flight[idx]);
		p->sectors[idx] += len;
		p->ios[idx] += 1;
		p->merges[idx] += stats_aux->merged;
		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
			p->ticks[idx] += duration_jiffies;
			duration = jiffies_to_msecs(duration_jiffies);
		} else {
			p->ticks[idx] += stats_aux->duration_ns;
			duration = stats_aux->duration_ns;
		}
		if (s->n_histogram_entries) {
			unsigned lo = 0, hi = s->n_histogram_entries + 1;
			while (lo + 1 < hi) {
				unsigned mid = (lo + hi) / 2;
				if (s->histogram_boundaries[mid - 1] > duration) {
					hi = mid;
				} else {
					lo = mid;
				}
			}
			p->histogram[lo]++;
		}
	}

#if BITS_PER_LONG == 32
	local_irq_restore(flags);
#else
	preempt_enable();
#endif
}

static void __dm_stat_bio(struct dm_stat *s, int bi_rw,
			  sector_t bi_sector, sector_t end_sector,
			  bool end, unsigned long duration_jiffies,
			  struct dm_stats_aux *stats_aux)
{
	sector_t rel_sector, offset, todo, fragment_len;
	size_t entry;

	if (end_sector <= s->start || bi_sector >= s->end)
		return;
	if (unlikely(bi_sector < s->start)) {
		rel_sector = 0;
		todo = end_sector - s->start;
	} else {
		rel_sector = bi_sector - s->start;
		todo = end_sector - bi_sector;
	}
	if (unlikely(end_sector > s->end))
		todo -= (end_sector - s->end);

	offset = dm_sector_div64(rel_sector, s->step);
	entry = rel_sector;
	do {
		if (WARN_ON_ONCE(entry >= s->n_entries)) {
			DMCRIT("Invalid area access in region id %d", s->id);
			return;
		}
		fragment_len = todo;
		if (fragment_len > s->step - offset)
			fragment_len = s->step - offset;
		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
				  stats_aux, end, duration_jiffies);
		todo -= fragment_len;
		entry++;
		offset = 0;
	} while (unlikely(todo != 0));
}

void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
			 sector_t bi_sector, unsigned bi_sectors, bool end,
			 unsigned long duration_jiffies,
			 struct dm_stats_aux *stats_aux)
{
	struct dm_stat *s;
	sector_t end_sector;
	struct dm_stats_last_position *last;
	bool got_precise_time;

	if (unlikely(!bi_sectors))
		return;

	end_sector = bi_sector + bi_sectors;

	if (!end) {
		/*
		 * A race condition can at worst result in the merged flag being
		 * misrepresented, so we don't have to disable preemption here.
		 */
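		/*
		 * stats->last records, per CPU, where the previously seen bio
		 * ended (last_sector) and its direction (last_rw); the merged
		 * flag below is derived from comparing the current bio against
		 * that record.
		 */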
		last = raw_cpu_ptr(stats->last);
		stats_aux->merged =
			(bi_sector == (READ_ONCE(last->last_sector) &&
				       ((bi_rw == WRITE) ==
					(READ_ONCE(last->last_rw) == WRITE))
				       ));
		WRITE_ONCE(last->last_sector, end_sector);
		WRITE_ONCE(last->last_rw, bi_rw);
	}

	rcu_read_lock();

	got_precise_time = false;
	list_for_each_entry_rcu(s, &stats->list, list_entry) {
		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
			if (!end)
				stats_aux->duration_ns = ktime_to_ns(ktime_get());
			else
				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
			got_precise_time = true;
		}
		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
	}

	rcu_read_unlock();
}

static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
						   struct dm_stat *s, size_t x)
{
	int cpu;
	struct dm_stat_percpu *p;

	local_irq_disable();
	p = &s->stat_percpu[smp_processor_id()][x];
	dm_stat_round(s, shared, p);
	local_irq_enable();

	shared->tmp.sectors[READ] = 0;
	shared->tmp.sectors[WRITE] = 0;
	shared->tmp.ios[READ] = 0;
	shared->tmp.ios[WRITE] = 0;
	shared->tmp.merges[READ] = 0;
	shared->tmp.merges[WRITE] = 0;
	shared->tmp.ticks[READ] = 0;
	shared->tmp.ticks[WRITE] = 0;
	shared->tmp.io_ticks[READ] = 0;
	shared->tmp.io_ticks[WRITE] = 0;
	shared->tmp.io_ticks_total = 0;
	shared->tmp.time_in_queue = 0;

	if (s->n_histogram_entries)
		memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));

	for_each_possible_cpu(cpu) {
		p = &s->stat_percpu[cpu][x];
		shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
		shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
		shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
		shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
		shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
		shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
		shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
		shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
		shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
		shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
		shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
		shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
		if (s->n_histogram_entries) {
			unsigned i;
			for (i = 0; i < s->n_histogram_entries + 1; i++)
				shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
		}
	}
}

static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
			    bool init_tmp_percpu_totals)
{
	size_t x;
	struct dm_stat_shared *shared;
	struct dm_stat_percpu *p;

	for (x = idx_start; x < idx_end; x++) {
		shared = &s->stat_shared[x];
		if (init_tmp_percpu_totals)
			__dm_stat_init_temporary_percpu_totals(shared, s, x);
		local_irq_disable();
		p = &s->stat_percpu[smp_processor_id()][x];
		p->sectors[READ] -= shared->tmp.sectors[READ];
		p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
		p->ios[READ] -= shared->tmp.ios[READ];
		p->ios[WRITE] -= shared->tmp.ios[WRITE];
		p->merges[READ] -= shared->tmp.merges[READ];
		p->merges[WRITE] -= shared->tmp.merges[WRITE];
		p->ticks[READ] -= shared->tmp.ticks[READ];
		p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
		p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
		p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
		p->io_ticks_total -= shared->tmp.io_ticks_total;
		p->time_in_queue -= shared->tmp.time_in_queue;
		local_irq_enable();
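		/*
		 * Clearing works by subtracting the snapshot in shared->tmp
		 * from the current CPU's counters rather than zeroing every
		 * CPU, so updates that race with the clear are not lost; the
		 * histogram buckets below are cleared the same way.
		 */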
		if (s->n_histogram_entries) {
			unsigned i;
			for (i = 0; i < s->n_histogram_entries + 1; i++) {
				local_irq_disable();
				p = &s->stat_percpu[smp_processor_id()][x];
				p->histogram[i] -= shared->tmp.histogram[i];
				local_irq_enable();
			}
		}
		cond_resched();
	}
}

static int dm_stats_clear(struct dm_stats *stats, int id)
{
	struct dm_stat *s;

	mutex_lock(&stats->mutex);

	s = __dm_stats_find(stats, id);
	if (!s) {
		mutex_unlock(&stats->mutex);
		return -ENOENT;
	}

	__dm_stat_clear(s, 0, s->n_entries, true);

	mutex_unlock(&stats->mutex);

	return 1;
}

/*
 * This is like jiffies_to_msecs, but works for 64-bit values.
 */
static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
{
	unsigned long long result;
	unsigned mult;

	if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
		return j;

	result = 0;
	if (j)
		result = jiffies_to_msecs(j & 0x3fffff);
	if (j >= 1 << 22) {
		mult = jiffies_to_msecs(1 << 22);
		result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
	}
	if (j >= 1ULL << 44)
		result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);

	return result;
}

static int dm_stats_print(struct dm_stats *stats, int id,
			  size_t idx_start, size_t idx_len,
			  bool clear, char *result, unsigned maxlen)
{
	unsigned sz = 0;
	struct dm_stat *s;
	size_t x;
	sector_t start, end, step;
	size_t idx_end;
	struct dm_stat_shared *shared;

	/*
	 * Output format:
	 *   <start_sector>+<length> counters
	 */

	mutex_lock(&stats->mutex);

	s = __dm_stats_find(stats, id);
	if (!s) {
		mutex_unlock(&stats->mutex);
		return -ENOENT;
	}

	idx_end = idx_start + idx_len;
	if (idx_end < idx_start ||
	    idx_end > s->n_entries)
		idx_end = s->n_entries;

	if (idx_start > idx_end)
		idx_start = idx_end;

	step = s->step;
	start = s->start + (step * idx_start);

	for (x = idx_start; x < idx_end; x++, start = end) {
		shared = &s->stat_shared[x];
		end = start + step;
		if (unlikely(end > s->end))
			end = s->end;

		__dm_stat_init_temporary_percpu_totals(shared, s, x);

		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
		       (unsigned long long)start,
		       (unsigned long long)step,
		       shared->tmp.ios[READ],
		       shared->tmp.merges[READ],
		       shared->tmp.sectors[READ],
		       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
		       shared->tmp.ios[WRITE],
		       shared->tmp.merges[WRITE],
		       shared->tmp.sectors[WRITE],
		       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
		       dm_stat_in_flight(shared),
		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
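		/*
		 * Fields emitted above, in order: reads, read merges, sectors
		 * read, read time, writes, write merges, sectors written,
		 * write time, I/Os in flight, total busy time, weighted time
		 * in queue, and busy time split into read/write. Times are in
		 * msec, or nsec when precise_timestamps is enabled. Histogram
		 * buckets, if configured, are appended below.
		 */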
" " : ":", shared->tmp.histogram[i]); 873 } 874 } 875 DMEMIT("\n"); 876 877 if (unlikely(sz + 1 >= maxlen)) 878 goto buffer_overflow; 879 880 cond_resched(); 881 } 882 883 if (clear) 884 __dm_stat_clear(s, idx_start, idx_end, false); 885 886buffer_overflow: 887 mutex_unlock(&stats->mutex); 888 889 return 1; 890} 891 892static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 893{ 894 struct dm_stat *s; 895 const char *new_aux_data; 896 897 mutex_lock(&stats->mutex); 898 899 s = __dm_stats_find(stats, id); 900 if (!s) { 901 mutex_unlock(&stats->mutex); 902 return -ENOENT; 903 } 904 905 new_aux_data = kstrdup(aux_data, GFP_KERNEL); 906 if (!new_aux_data) { 907 mutex_unlock(&stats->mutex); 908 return -ENOMEM; 909 } 910 911 kfree(s->aux_data); 912 s->aux_data = new_aux_data; 913 914 mutex_unlock(&stats->mutex); 915 916 return 0; 917} 918 919static int parse_histogram(const char *h, unsigned *n_histogram_entries, 920 unsigned long long **histogram_boundaries) 921{ 922 const char *q; 923 unsigned n; 924 unsigned long long last; 925 926 *n_histogram_entries = 1; 927 for (q = h; *q; q++) 928 if (*q == ',') 929 (*n_histogram_entries)++; 930 931 *histogram_boundaries = kmalloc_array(*n_histogram_entries, 932 sizeof(unsigned long long), 933 GFP_KERNEL); 934 if (!*histogram_boundaries) 935 return -ENOMEM; 936 937 n = 0; 938 last = 0; 939 while (1) { 940 unsigned long long hi; 941 int s; 942 char ch; 943 s = sscanf(h, "%llu%c", &hi, &ch); 944 if (!s || (s == 2 && ch != ',')) 945 return -EINVAL; 946 if (hi <= last) 947 return -EINVAL; 948 last = hi; 949 (*histogram_boundaries)[n] = hi; 950 if (s == 1) 951 return 0; 952 h = strchr(h, ',') + 1; 953 n++; 954 } 955} 956 957static int message_stats_create(struct mapped_device *md, 958 unsigned argc, char **argv, 959 char *result, unsigned maxlen) 960{ 961 int r; 962 int id; 963 char dummy; 964 unsigned long long start, end, len, step; 965 unsigned divisor; 966 const char *program_id, *aux_data; 967 unsigned stat_flags = 0; 968 969 unsigned n_histogram_entries = 0; 970 unsigned long long *histogram_boundaries = NULL; 971 972 struct dm_arg_set as, as_backup; 973 const char *a; 974 unsigned feature_args; 975 976 /* 977 * Input format: 978 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 979 */ 980 981 if (argc < 3) 982 goto ret_einval; 983 984 as.argc = argc; 985 as.argv = argv; 986 dm_consume_args(&as, 1); 987 988 a = dm_shift_arg(&as); 989 if (!strcmp(a, "-")) { 990 start = 0; 991 len = dm_get_size(md); 992 if (!len) 993 len = 1; 994 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 995 start != (sector_t)start || len != (sector_t)len) 996 goto ret_einval; 997 998 end = start + len; 999 if (start >= end) 1000 goto ret_einval; 1001 1002 a = dm_shift_arg(&as); 1003 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 1004 if (!divisor) 1005 return -EINVAL; 1006 step = end - start; 1007 if (do_div(step, divisor)) 1008 step++; 1009 if (!step) 1010 step = 1; 1011 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 1012 step != (sector_t)step || !step) 1013 goto ret_einval; 1014 1015 as_backup = as; 1016 a = dm_shift_arg(&as); 1017 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 1018 while (feature_args--) { 1019 a = dm_shift_arg(&as); 1020 if (!a) 1021 goto ret_einval; 1022 if (!strcasecmp(a, "precise_timestamps")) 1023 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1024 else if (!strncasecmp(a, "histogram:", 10)) { 1025 if (n_histogram_entries) 1026 goto ret_einval; 1027 if ((r = 
				if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
					goto ret;
			} else
				goto ret_einval;
		}
	} else {
		as = as_backup;
	}

	program_id = "-";
	aux_data = "-";

	a = dm_shift_arg(&as);
	if (a)
		program_id = a;

	a = dm_shift_arg(&as);
	if (a)
		aux_data = a;

	if (as.argc)
		goto ret_einval;

	/*
	 * If a buffer overflow happens after we created the region,
	 * it's too late (userspace would retry with a larger buffer,
	 * but the region id that caused the overflow has already been
	 * leaked). So we must detect buffer overflow in advance.
	 */
	snprintf(result, maxlen, "%d", INT_MAX);
	if (dm_message_test_buffer_overflow(result, maxlen)) {
		r = 1;
		goto ret;
	}

	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
			     n_histogram_entries, histogram_boundaries, program_id, aux_data,
			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
	if (id < 0) {
		r = id;
		goto ret;
	}

	snprintf(result, maxlen, "%d", id);

	r = 1;
	goto ret;

ret_einval:
	r = -EINVAL;
ret:
	kfree(histogram_boundaries);
	return r;
}

static int message_stats_delete(struct mapped_device *md,
				unsigned argc, char **argv)
{
	int id;
	char dummy;

	if (argc != 2)
		return -EINVAL;

	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
		return -EINVAL;

	return dm_stats_delete(dm_get_stats(md), id);
}

static int message_stats_clear(struct mapped_device *md,
			       unsigned argc, char **argv)
{
	int id;
	char dummy;

	if (argc != 2)
		return -EINVAL;

	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
		return -EINVAL;

	return dm_stats_clear(dm_get_stats(md), id);
}

static int message_stats_list(struct mapped_device *md,
			      unsigned argc, char **argv,
			      char *result, unsigned maxlen)
{
	int r;
	const char *program = NULL;

	if (argc < 1 || argc > 2)
		return -EINVAL;

	if (argc > 1) {
		program = kstrdup(argv[1], GFP_KERNEL);
		if (!program)
			return -ENOMEM;
	}

	r = dm_stats_list(dm_get_stats(md), program, result, maxlen);

	kfree(program);

	return r;
}

static int message_stats_print(struct mapped_device *md,
			       unsigned argc, char **argv, bool clear,
			       char *result, unsigned maxlen)
{
	int id;
	char dummy;
	unsigned long idx_start = 0, idx_len = ULONG_MAX;

	if (argc != 2 && argc != 4)
		return -EINVAL;

	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
		return -EINVAL;

	if (argc > 3) {
		if (strcmp(argv[2], "-") &&
		    sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
			return -EINVAL;
		if (strcmp(argv[3], "-") &&
		    sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
			return -EINVAL;
	}

	return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
			      result, maxlen);
}

static int message_stats_set_aux(struct mapped_device *md,
				 unsigned argc, char **argv)
{
	int id;
	char dummy;

	if (argc != 3)
		return -EINVAL;

	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
		return -EINVAL;

	return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
}

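/*
 * Illustrative usage from userspace via "dmsetup message" (the "0" is the
 * message sector; see the device-mapper statistics documentation for the
 * full syntax):
 *
 *   dmsetup message <dev> 0 @stats_create - /100 1 precise_timestamps
 *   dmsetup message <dev> 0 @stats_print 0
 *   dmsetup message <dev> 0 @stats_delete 0
 */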

int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
		     char *result, unsigned maxlen)
{
	int r;

	/* All messages here must start with '@' */
	if (!strcasecmp(argv[0], "@stats_create"))
		r = message_stats_create(md, argc, argv, result, maxlen);
	else if (!strcasecmp(argv[0], "@stats_delete"))
		r = message_stats_delete(md, argc, argv);
	else if (!strcasecmp(argv[0], "@stats_clear"))
		r = message_stats_clear(md, argc, argv);
	else if (!strcasecmp(argv[0], "@stats_list"))
		r = message_stats_list(md, argc, argv, result, maxlen);
	else if (!strcasecmp(argv[0], "@stats_print"))
		r = message_stats_print(md, argc, argv, false, result, maxlen);
	else if (!strcasecmp(argv[0], "@stats_print_clear"))
		r = message_stats_print(md, argc, argv, true, result, maxlen);
	else if (!strcasecmp(argv[0], "@stats_set_aux"))
		r = message_stats_set_aux(md, argc, argv);
	else
		return 2; /* this wasn't a stats message */

	if (r == -EINVAL)
		DMWARN("Invalid parameters for message %s", argv[0]);

	return r;
}

int __init dm_statistics_init(void)
{
	shared_memory_amount = 0;
	dm_stat_need_rcu_barrier = 0;
	return 0;
}

void dm_statistics_exit(void)
{
	if (dm_stat_need_rcu_barrier)
		rcu_barrier();
	if (WARN_ON(shared_memory_amount))
		DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
}

module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");